From: <bi...@us...> - 2008-06-27 00:31:49
|
Revision: 2336 http://archive-access.svn.sourceforge.net/archive-access/?rev=2336&view=rev Author: binzino Date: 2008-06-26 17:31:59 -0700 (Thu, 26 Jun 2008) Log Message: ----------- Initial revision. Added Paths: ----------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWaxBean.java Added: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWaxBean.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWaxBean.java (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWaxBean.java 2008-06-27 00:31:59 UTC (rev 2336) @@ -0,0 +1,238 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.nutchwax; + +import java.io.*; +import java.util.*; +import java.lang.reflect.Field; +import javax.servlet.*; + +import org.apache.nutch.searcher.NutchBean; +import org.apache.nutch.searcher.IndexSearcher; +import org.apache.nutch.searcher.Query; +import org.apache.nutch.searcher.HitDetails; +import org.apache.nutch.searcher.Hit; +import org.apache.nutch.searcher.Hits; +import org.apache.nutch.searcher.Summary; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Closeable; +import org.apache.hadoop.conf.Configuration; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.ArchiveParallelReader; +import org.apache.lucene.index.MultiReader; + +import org.apache.nutch.util.HadoopFSUtil; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.indexer.FsDirectory; + +/** + * Utility class to use and extend the NutchBean class for reading + * from parallel indices. + * + * This can be used from the command-line to run test/debug searches, + * the same as NutchBean, but using parallel indices. + * + * NutchWaxBean doesn't extend NutchBean directly since all the good + * stuff inside of NutchBean is declared private. So, we dynamically + * modify a NutchBean instance via reflection to inject our own + * IndexReader that reads from a set of parallel indices. + * + * Before you recoil in horror over this approach, the alternatives + * were none too pretty. Sub-classing won't work since all the + * NutchBean data members are declared private. We could copy the + * NutchBean.java into our own source base and effectively over-write + * the Nutch version when we compile, but that is a maintenance + * headache of extreme magnitude. Plus, we'd probably have to + * copy/past/edit multiple Java source files. + * + * Ideally, Nutch would use some sort of dependency injection system, + * or at least make the NutchBean data members have public get/set + * methods (like a bean should). For now, doing dynamic injection via + * reflection seemed the least obtrusive. + */ +public class NutchWaxBean +{ + + /** + * Static utility class for modifying a NutchBean instance. + */ + public static class NutchBeanModifier + { + /** + * Modify the NutchBean by replacing the IndexReader in its + * IndexSearcher with one we create that uses + * ArchiveParallelReader for searching across parallel indices. + */ + public static void modify( NutchBean bean ) + { + try + { + // First, get the configuration from the bean. Gosh it would be + // nice if NutchBean had a getConf() public method, wouldn't it? + Field fConf = NutchBean.class.getDeclaredField( "conf" ); + fConf.setAccessible( true ); + + // The rest of this code is similar to NutchBean in that it + // looks for a 'pindexes' directory as a sibling of the + // 'indexes' directory that NutchBean finds. + Configuration conf = (Configuration) fConf.get( bean ); + + FileSystem fs = FileSystem.get( conf ); + + Path dir = new Path( conf.get( "searcher.dir", "crawl") ); + + Path indexesDir = new Path( dir, "pindexes" ); + + Path indexDirs[] = fs.listPaths(indexesDir, HadoopFSUtil.getPassDirectoriesFilter(fs)); + + List<IndexReader> readers = new ArrayList<IndexReader>( indexDirs.length ); + + for ( Path indexDir : indexDirs ) + { + Path parallelDirs[] = fs.listPaths( indexDir, HadoopFSUtil.getPassDirectoriesFilter(fs) ); + + if ( parallelDirs.length < 1 ) + { + continue; + } + + ArchiveParallelReader reader = new ArchiveParallelReader( ); + + // Sort the parallelDirs so that we add them in order. Order + // matters to the ParallelReader. + Arrays.sort( parallelDirs ); + + for ( Path p : parallelDirs ) + { + reader.add( IndexReader.open( new FsDirectory( fs, p, false, conf ) ) ); + } + + readers.add( reader ); + } + + MultiReader reader = new MultiReader( readers.toArray( new IndexReader[0] ) ); + + // Now, inject the 'reader' into the NutchBean's IndexSearcher via reflection. + Field fSearcher = NutchBean.class.getDeclaredField( "searcher" ); + Field fReader = IndexSearcher.class.getDeclaredField( "reader" ); + Field fLuceneSearcher = IndexSearcher.class.getDeclaredField( "luceneSearcher" ); + + fSearcher .setAccessible( true ); + fReader .setAccessible( true ); + fLuceneSearcher.setAccessible( true ); + + org.apache.lucene.search.IndexSearcher newLuceneSearcher = new org.apache.lucene.search.IndexSearcher( reader ); + + IndexSearcher searcher = (IndexSearcher) fSearcher.get( bean ); + fLuceneSearcher.set( searcher, newLuceneSearcher ); + fReader .set( searcher, reader ); + } + catch ( Exception e ) + { + throw new RuntimeException( e ); + } + } + } + + /** + * Similar to code in NutchBean. This receives the events from the + * servlet container and modifies the NutchBean instance put there + * by the NutchBeanConstructor listener. For this to work, it must + * be declared after the NutchBeanConstructor in the web.xml file, + * e.g. + * <pre> + * <listener> + * <listener-class>org.apache.nutch.searcher.NutchBean$NutchBeanConstructor</listener-class> + * <listener-class>org.archive.nutchwax.NutchWaxBean$NutchWaxBeanConstructor</listener-class> + * </listener> + * </pre> + */ + public static class NutchWaxBeanConstructor implements ServletContextListener + { + + public void contextDestroyed( ServletContextEvent sce ) + { + } + + public void contextInitialized( ServletContextEvent sce ) + { + ServletContext app = sce.getServletContext(); + NutchBean bean = (NutchBean) app.getAttribute( NutchBean.KEY ); + + if ( bean == null ) + { + NutchBean.LOG.fatal( "No value for \"" + NutchBean.KEY + "\" in servlet context" ); + + return ; + } + + // Modify the NutchBean. + NutchBeanModifier.modify( bean ); + } + + } + + /** + * Simple command-line driver akin to NutchBean.main that peforms + * the ben modification. Useful for testing and debugging from the + * command-line. + */ + public static void main(String[] args) throws Exception + { + String usage = "NutchWaxBean query"; + + if (args.length == 0) + { + System.err.println(usage); + System.exit(-1); + } + + Configuration conf = NutchConfiguration.create(); + + NutchBean bean = new NutchBean(conf); + NutchBeanModifier.modify( bean ); + + Query query = Query.parse(args[0], conf); + Hits hits = bean.search(query, 10); + System.out.println("Total hits: " + hits.getTotal()); + int length = (int)Math.min(hits.getTotal(), 10); + Hit[] show = hits.getHits(0, length); + HitDetails[] details = bean.getDetails(show); + Summary[] summaries = bean.getSummary(details, query); + + for (int i = 0; i < hits.getLength(); i++) + { + // Use a slightly more verbose output than NutchBean. + System.out.println( " " + + i + + " " + + details[i].getValue( "segment" ) + + " " + + details[i].getValue( "url") + + " " + + details[i].getValue( "archive-digest") + + " " + + java.util.Arrays.asList( details[i].getValues( "date") ) + + "\n" + + summaries[i] ); + } + } + + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |