From: <bi...@us...> - 2008-08-28 22:03:58
|
Revision: 2587 http://archive-access.svn.sourceforge.net/archive-access/?rev=2587&view=rev Author: binzino Date: 2008-08-28 22:04:08 +0000 (Thu, 28 Aug 2008) Log Message: ----------- Updated use of Hadoop FS utilities to match changed interfaces in Nutch and Hadoop when Nutch upgraded to Hadoop 0.17. Also added gratuitous log messages when looking for Lucene indexes to open in NutchWaxBean. These messages should help operators avoid common errors where the Nutch index directories are not being found yet NutchBean emits no helpful log messages. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWaxBean.java Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWaxBean.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWaxBean.java 2008-08-28 21:54:54 UTC (rev 2586) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWaxBean.java 2008-08-28 22:04:08 UTC (rev 2587) @@ -21,6 +21,9 @@ import java.lang.reflect.Field; import javax.servlet.*; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + import org.apache.nutch.searcher.NutchBean; import org.apache.nutch.searcher.IndexSearcher; import org.apache.nutch.searcher.Query; @@ -28,6 +31,7 @@ import org.apache.nutch.searcher.Hit; import org.apache.nutch.searcher.Hits; import org.apache.nutch.searcher.Summary; +import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Closeable; @@ -68,6 +72,7 @@ */ public class NutchWaxBean { + public static final Log LOG = LogFactory.getLog( NutchWaxBean.class ); /** * Static utility class for modifying a NutchBean instance. 
@@ -79,10 +84,12 @@ * IndexSearcher with one we create that uses * ArchiveParallelReader for searching across parallel indices. */ - public static void modify( NutchBean bean ) + public static boolean modify( NutchBean bean ) { try { + LOG.info( "Modifying NutchBean with NutchWAX extensions..." ); + // First, get the configuration from the bean. Gosh it would be // nice if NutchBean had a getConf() public method, wouldn't it? Field fConf = NutchBean.class.getDeclaredField( "conf" ); @@ -95,20 +102,57 @@ FileSystem fs = FileSystem.get( conf ); - Path dir = new Path( conf.get( "searcher.dir", "crawl") ); + Path dir = new Path( conf.get( "searcher.dir", "crawl") ).makeQualified( fs ); + LOG.info( "Looking for Nutch indexes in: " + dir ); + if ( ! fs.exists( dir ) ) + { + LOG.warn( "Directory does not exist: " + dir ); + LOG.warn( "NutchBean not modified." ); + LOG.warn( "No Nutch indexes will be found and all queries will return no results." ); + + return false; + } + + Path indexesDir = new Path( dir, "pindexes" ).makeQualified(fs); + LOG.info( "Looking for NutchWax parallel indexes in: " + indexesDir ); + if ( ! fs.exists( indexesDir ) ) + { + LOG.warn( "Parallel indexes directory does not exist: " + indexesDir ); + LOG.warn( "NutchBean not modified." ); + + return false; + } + + if ( ! fs.getFileStatus( indexesDir ).isDir( ) ) + { + LOG.warn( "Parallel indexes directory is not a directory: " + indexesDir ); + LOG.warn( "NutchBean not modified." ); + + return false; + } + + FileStatus[] fstats = fs.listStatus(indexesDir, HadoopFSUtil.getPassDirectoriesFilter(fs)); + Path[] indexDirs = HadoopFSUtil.getPaths( fstats ); + + if ( indexDirs.length < 1 ) + { + LOG.info( "No sub-dirs found in parallel indexes directory: " + indexesDir ); + LOG.warn( "NutchBean not modified." 
); + + return false; + } - Path indexesDir = new Path( dir, "pindexes" ); - - Path indexDirs[] = fs.listPaths(indexesDir, HadoopFSUtil.getPassDirectoriesFilter(fs)); - List<IndexReader> readers = new ArrayList<IndexReader>( indexDirs.length ); for ( Path indexDir : indexDirs ) { - Path parallelDirs[] = fs.listPaths( indexDir, HadoopFSUtil.getPassDirectoriesFilter(fs) ); + fstats = fs.listStatus( indexDir, HadoopFSUtil.getPassDirectoriesFilter(fs) ); + Path parallelDirs[] = HadoopFSUtil.getPaths( fstats ); if ( parallelDirs.length < 1 ) { + LOG.info( "No sub-directories, skipping: " + indexDir ); + continue; } @@ -120,11 +164,20 @@ for ( Path p : parallelDirs ) { + LOG.info( "Adding reader for: " + p ); reader.add( IndexReader.open( new FsDirectory( fs, p, false, conf ) ) ); } readers.add( reader ); } + + if ( readers.size( ) == 0 ) + { + LOG.warn( "No parallel indexes in: " + indexesDir ); + LOG.warn( "NutchBean not modified." ); + + return false; + } MultiReader reader = new MultiReader( readers.toArray( new IndexReader[0] ) ); @@ -142,6 +195,8 @@ IndexSearcher searcher = (IndexSearcher) fSearcher.get( bean ); fLuceneSearcher.set( searcher, newLuceneSearcher ); fReader .set( searcher, reader ); + + return true; } catch ( Exception e ) { @@ -177,7 +232,7 @@ if ( bean == null ) { - NutchBean.LOG.fatal( "No value for \"" + NutchBean.KEY + "\" in servlet context" ); + LOG.fatal( "No value for \"" + NutchBean.KEY + "\" in servlet context" ); return ; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |