From: <bi...@us...> - 2009-10-28 22:04:01
|
Revision: 2863 http://archive-access.svn.sourceforge.net/archive-access/?rev=2863&view=rev Author: binzino Date: 2009-10-28 22:03:37 +0000 (Wed, 28 Oct 2009) Log Message: ----------- Removed as all NutchWAX mods/edits have been moved into NutchBean in the Nutch source overlay. Removed Paths: ------------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWaxBean.java Deleted: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWaxBean.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWaxBean.java 2009-10-28 21:55:11 UTC (rev 2862) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWaxBean.java 2009-10-28 22:03:37 UTC (rev 2863) @@ -1,296 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.archive.nutchwax; - -//import java.io.*; -import java.util.*; -import java.lang.reflect.Field; -import javax.servlet.ServletContext; -import javax.servlet.ServletContextEvent; -import javax.servlet.ServletContextListener; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -import org.apache.nutch.searcher.NutchBean; -import org.apache.nutch.searcher.IndexSearcher; -import org.apache.nutch.searcher.Query; -import org.apache.nutch.searcher.HitDetails; -import org.apache.nutch.searcher.Hit; -import org.apache.nutch.searcher.Hits; -import org.apache.nutch.searcher.Summary; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.conf.Configuration; - -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.ArchiveParallelReader; -import org.apache.lucene.index.MultiReader; - -import org.apache.nutch.util.HadoopFSUtil; -import org.apache.nutch.util.NutchConfiguration; -import org.apache.nutch.indexer.FsDirectory; - -/** - * Utility class to use and extend the NutchBean class for reading - * from parallel indices. - * - * This can be used from the command-line to run test/debug searches, - * the same as NutchBean, but using parallel indices. - * - * NutchWaxBean doesn't extend NutchBean directly since all the good - * stuff inside of NutchBean is declared private. So, we dynamically - * modify a NutchBean instance via reflection to inject our own - * IndexReader that reads from a set of parallel indices. - * - * Before you recoil in horror over this approach, the alternatives - * were none too pretty. Sub-classing won't work since all the - * NutchBean data members are declared private. We could copy the - * NutchBean.java into our own source base and effectively over-write - * the Nutch version when we compile, but that is a maintenance - * headache of extreme magnitude. 
Plus, we'd probably have to - * copy/paste/edit multiple Java source files. - * - * Ideally, Nutch would use some sort of dependency injection system, - * or at least make the NutchBean data members have public get/set - * methods (like a bean should). For now, doing dynamic injection via - * reflection seemed the least obtrusive. - */ -public class NutchWaxBean -{ - public static final Log LOG = LogFactory.getLog( NutchWaxBean.class ); - - /** - * Static utility class for modifying a NutchBean instance. - */ - public static class NutchBeanModifier - { - /** - * Modify the NutchBean by replacing the IndexReader in its - * IndexSearcher with one we create that uses - * ArchiveParallelReader for searching across parallel indices. - */ - public static boolean modify( NutchBean bean ) - { - try - { - LOG.info( "Modifying NutchBean with NutchWAX extensions..." ); - - // First, get the configuration from the bean. Gosh it would be - // nice if NutchBean had a getConf() public method, wouldn't it? - Field fConf = NutchBean.class.getDeclaredField( "conf" ); - fConf.setAccessible( true ); - - // The rest of this code is similar to NutchBean in that it - // looks for a 'pindexes' directory as a sibling of the - // 'indexes' directory that NutchBean finds. - Configuration conf = (Configuration) fConf.get( bean ); - - FileSystem fs = FileSystem.get( conf ); - - Path dir = new Path( conf.get( "searcher.dir", "crawl") ).makeQualified( fs ); - LOG.info( "Looking for Nutch indexes in: " + dir ); - if ( ! fs.exists( dir ) ) - { - LOG.warn( "Directory does not exist: " + dir ); - LOG.warn( "NutchBean not modified." ); - LOG.warn( "No Nutch indexes will be found and all queries will return no results." ); - - return false; - } - - Path indexesDir = new Path( dir, "pindexes" ).makeQualified(fs); - LOG.info( "Looking for NutchWax parallel indexes in: " + indexesDir ); - if ( ! 
fs.exists( indexesDir ) ) - { - LOG.warn( "Parallel indexes directory does not exist: " + indexesDir ); - LOG.warn( "NutchBean not modified." ); - - return false; - } - - if ( ! fs.getFileStatus( indexesDir ).isDir( ) ) - { - LOG.warn( "Parallel indexes directory is not a directory: " + indexesDir ); - LOG.warn( "NutchBean not modified." ); - - return false; - } - - FileStatus[] fstats = fs.listStatus(indexesDir, HadoopFSUtil.getPassDirectoriesFilter(fs)); - Path[] indexDirs = HadoopFSUtil.getPaths( fstats ); - - if ( indexDirs.length < 1 ) - { - LOG.info( "No sub-dirs found in parallel indexes directory: " + indexesDir ); - LOG.warn( "NutchBean not modified." ); - - return false; - } - - List<IndexReader> readers = new ArrayList<IndexReader>( indexDirs.length ); - - for ( Path indexDir : indexDirs ) - { - fstats = fs.listStatus( indexDir, HadoopFSUtil.getPassDirectoriesFilter(fs) ); - Path parallelDirs[] = HadoopFSUtil.getPaths( fstats ); - - if ( parallelDirs.length < 1 ) - { - LOG.info( "No sub-directories, skipping: " + indexDir ); - - continue; - } - - ArchiveParallelReader reader = new ArchiveParallelReader( ); - - // Sort the parallelDirs so that we add them in order. Order - // matters to the ParallelReader. - Arrays.sort( parallelDirs ); - - for ( Path p : parallelDirs ) - { - LOG.info( "Adding reader for: " + p ); - reader.add( IndexReader.open( new FsDirectory( fs, p, false, conf ) ) ); - } - - readers.add( reader ); - } - - if ( readers.size( ) == 0 ) - { - LOG.warn( "No parallel indexes in: " + indexesDir ); - LOG.warn( "NutchBean not modified." ); - - return false; - } - - MultiReader reader = new MultiReader( readers.toArray( new IndexReader[0] ) ); - - // Now, inject the 'reader' into the NutchBean's IndexSearcher via reflection. 
- Field fSearcher = NutchBean.class.getDeclaredField( "searcher" ); - Field fReader = IndexSearcher.class.getDeclaredField( "reader" ); - Field fLuceneSearcher = IndexSearcher.class.getDeclaredField( "luceneSearcher" ); - - fSearcher .setAccessible( true ); - fReader .setAccessible( true ); - fLuceneSearcher.setAccessible( true ); - - org.apache.lucene.search.IndexSearcher newLuceneSearcher = new org.apache.lucene.search.IndexSearcher( reader ); - - IndexSearcher searcher = (IndexSearcher) fSearcher.get( bean ); - fLuceneSearcher.set( searcher, newLuceneSearcher ); - fReader .set( searcher, reader ); - - return true; - } - catch ( Exception e ) - { - throw new RuntimeException( e ); - } - } - } - - /** - * Similar to code in NutchBean. This receives the events from the - * servlet container and modifies the NutchBean instance put there - * by the NutchBeanConstructor listener. For this to work, it must - * be declared after the NutchBeanConstructor in the web.xml file, - * e.g. - * <pre> - * <listener> - * <listener-class>org.apache.nutch.searcher.NutchBean$NutchBeanConstructor</listener-class> - * <listener-class>org.archive.nutchwax.NutchWaxBean$NutchWaxBeanConstructor</listener-class> - * </listener> - * </pre> - */ - public static class NutchWaxBeanConstructor implements ServletContextListener - { - - public void contextDestroyed( ServletContextEvent sce ) - { - } - - public void contextInitialized( ServletContextEvent sce ) - { - ServletContext app = sce.getServletContext(); - NutchBean bean = (NutchBean) app.getAttribute( NutchBean.KEY ); - - if ( bean == null ) - { - LOG.fatal( "No value for \"" + NutchBean.KEY + "\" in servlet context" ); - - return ; - } - - // Modify the NutchBean. - NutchBeanModifier.modify( bean ); - } - - } - - /** - * Simple command-line driver akin to NutchBean.main that performs - * the bean modification. Useful for testing and debugging from the - * command-line. 
- */ - public static void main(String[] args) throws Exception - { - String usage = "NutchWaxBean query"; - - if (args.length == 0) - { - System.err.println(usage); - System.exit(-1); - } - - Configuration conf = NutchConfiguration.create(); - - NutchBean bean = new NutchBean(conf); - NutchBeanModifier.modify( bean ); - - Query query = Query.parse(args[0], conf); - Hits hits = bean.search(query, 10); - System.out.println("Total hits: " + hits.getTotal()); - int length = (int)Math.min(hits.getTotal(), 10); - Hit[] show = hits.getHits(0, length); - HitDetails[] details = bean.getDetails(show); - Summary[] summaries = bean.getSummary(details, query); - - for (int i = 0; i < hits.getLength(); i++) - { - // Use a slightly more verbose output than NutchBean. - System.out.println( " " - + i - + " " - + java.util.Arrays.asList( details[i].getValues( "segment" ) ) - + " " - + java.util.Arrays.asList( details[i].getValues( "url" ) ) - + " " - + java.util.Arrays.asList( details[i].getValues( "orig" ) ) - + " " - + java.util.Arrays.asList( details[i].getValues( "digest" ) ) - + " " - + java.util.Arrays.asList( details[i].getValues( "date" ) ) - + "\n" - + summaries[i] ); - } - } - - -} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |