[Archive-access-cvs] SF.net SVN: archive-access:[2863] trunk/archive-access/projects/nutchwax/ archive/src/java/org/archive/nutchwax/NutchWaxBean.java

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 2863
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2863&view=rev
Author:   binzino
Date:     2009-10-28 22:03:37 +0000 (Wed, 28 Oct 2009)

Log Message:
-----------
Removed as all NutchWAX mods/edits have been moved into NutchBean in the Nutch source overlay.

Removed Paths:
-------------
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWaxBean.java

Deleted: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWaxBean.java
===================================================================

--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWaxBean.java	2009-10-28 21:55:11 UTC (rev 2862)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWaxBean.java	2009-10-28 22:03:37 UTC (rev 2863)
@@ -1,296 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.archive.nutchwax;
-
-//import java.io.*;
-import java.util.*;
-import java.lang.reflect.Field;
-import javax.servlet.ServletContext;
-import javax.servlet.ServletContextEvent;
-import javax.servlet.ServletContextListener;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-
-import org.apache.nutch.searcher.NutchBean;
-import org.apache.nutch.searcher.IndexSearcher;
-import org.apache.nutch.searcher.Query;
-import org.apache.nutch.searcher.HitDetails;
-import org.apache.nutch.searcher.Hit;
-import org.apache.nutch.searcher.Hits;
-import org.apache.nutch.searcher.Summary;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.conf.Configuration;
-
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.ArchiveParallelReader;
-import org.apache.lucene.index.MultiReader;
-
-import org.apache.nutch.util.HadoopFSUtil;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.nutch.indexer.FsDirectory;
-
-/**
- * Utility class to use and extend the NutchBean class for reading
- * from parallel indices.
- *
- * This can be used from the command-line to run test/debug searches,
- * the same as NutchBean, but using parallel indices.
- *
- * NutchWaxBean doesn't extend NutchBean directly since all the good
- * stuff inside of NutchBean is declared private.  So, we dynamically
- * modify a NutchBean instance via reflection to inject our own
- * IndexReader that reads from a set of parallel indices.
- *
- * Before you recoil in horror over this approach, the alternatives
- * were none too pretty.  Sub-classing won't work since all the
- * NutchBean data members are declared private.  We could copy the
- * NutchBean.java into our own source base and effectively over-write
- * the Nutch version when we compile, but that is a maintenance
- * headache of extreme magnitude.  Plus, we'd probably have to
- * copy/paste/edit multiple Java source files.
- *
- * Ideally, Nutch would use some sort of dependency injection system,
- * or at least make the NutchBean data members have public get/set
- * methods (like a bean should).  For now, doing dynamic injection via
- * reflection seemed the least obtrusive.
- */
-public class NutchWaxBean
-{
-  public static final Log LOG = LogFactory.getLog( NutchWaxBean.class );
-
-  /**
-   * Static utility class for modifying a NutchBean instance.
-   */
-  public static class NutchBeanModifier
-  {
-    /** 
-     * Modify the NutchBean by replacing the IndexReader in its
-     * IndexSearcher with one we create that uses
-     * ArchiveParallelReader for searching across parallel indices.
-     */
-    public static boolean modify( NutchBean bean )
-    {
-      try
-        {
-          LOG.info( "Modifying NutchBean with NutchWAX extensions..." );
-          
-          // First, get the configuration from the bean.  Gosh it would be
-          // nice if NutchBean had a getConf() public method, wouldn't it?
-          Field fConf = NutchBean.class.getDeclaredField( "conf" );
-          fConf.setAccessible( true );
-
-          // The rest of this code is similar to NutchBean in that it
-          // looks for a 'pindexes' directory as a sibling of the
-          // 'indexes' directory that NutchBean finds.
-          Configuration conf = (Configuration) fConf.get( bean );
-
-          FileSystem fs = FileSystem.get( conf );
-      
-          Path dir = new Path( conf.get( "searcher.dir", "crawl") ).makeQualified( fs );
-          LOG.info( "Looking for Nutch indexes in: " + dir );
-          if ( ! fs.exists( dir ) )
-            {
-              LOG.warn( "Directory does not exist: " + dir );
-              LOG.warn( "NutchBean not modified." );
-              LOG.warn( "No Nutch indexes will be found and all queries will return no results." );
-
-              return false;
-            }
-          
-          Path indexesDir = new Path( dir, "pindexes" ).makeQualified(fs);
-          LOG.info( "Looking for NutchWax parallel indexes in: " + indexesDir );
-          if ( ! fs.exists( indexesDir ) )
-            {
-              LOG.warn( "Parallel indexes directory does not exist: " + indexesDir );
-              LOG.warn( "NutchBean not modified." );
-
-              return false;
-            }
-
-          if ( ! fs.getFileStatus( indexesDir ).isDir( ) )
-            {
-              LOG.warn( "Parallel indexes directory is not a directory: " + indexesDir );
-              LOG.warn( "NutchBean not modified." );
-
-              return false;
-            }
-          
-          FileStatus[] fstats = fs.listStatus(indexesDir, HadoopFSUtil.getPassDirectoriesFilter(fs));
-          Path[] indexDirs    = HadoopFSUtil.getPaths( fstats );
-
-          if ( indexDirs.length < 1 )
-            {
-              LOG.info( "No sub-dirs found in parallel indexes directory: " + indexesDir );
-              LOG.warn( "NutchBean not modified." );
-              
-              return false;
-            }
-      
-          List<IndexReader> readers = new ArrayList<IndexReader>( indexDirs.length );
-      
-          for ( Path indexDir : indexDirs )
-            {
-              fstats = fs.listStatus( indexDir, HadoopFSUtil.getPassDirectoriesFilter(fs) );
-              Path parallelDirs[] = HadoopFSUtil.getPaths( fstats );
-          
-              if ( parallelDirs.length < 1 )
-                {
-                  LOG.info( "No sub-directories, skipping: " + indexDir );
-
-                  continue;
-                }
-          
-              ArchiveParallelReader reader = new ArchiveParallelReader( );
-          
-              // Sort the parallelDirs so that we add them in order.  Order
-              // matters to the ParallelReader.
-              Arrays.sort( parallelDirs );
-          
-              for ( Path p : parallelDirs )
-                {
-                  LOG.info( "Adding reader for: " + p );
-                  reader.add( IndexReader.open( new FsDirectory( fs, p, false, conf ) ) ); 
-                }
-          
-              readers.add( reader );
-            }
-
-          if ( readers.size( ) == 0 )
-            {
-              LOG.warn( "No parallel indexes in: " + indexesDir );
-              LOG.warn( "NutchBean not modified." );
-
-              return false;
-            }
-      
-          MultiReader reader = new MultiReader( readers.toArray( new IndexReader[0] ) );
-      
-          // Now, inject the 'reader' into the NutchBean's IndexSearcher via reflection.
-          Field fSearcher       = NutchBean.class.getDeclaredField( "searcher" );
-          Field fReader         = IndexSearcher.class.getDeclaredField( "reader" );
-          Field fLuceneSearcher = IndexSearcher.class.getDeclaredField( "luceneSearcher" );
-      
-          fSearcher      .setAccessible( true );
-          fReader        .setAccessible( true );
-          fLuceneSearcher.setAccessible( true );
-      
-          org.apache.lucene.search.IndexSearcher newLuceneSearcher = new org.apache.lucene.search.IndexSearcher( reader );
-      
-          IndexSearcher searcher = (IndexSearcher) fSearcher.get( bean );
-          fLuceneSearcher.set( searcher, newLuceneSearcher );
-          fReader        .set( searcher, reader );
-
-          return true;
-        }
-      catch ( Exception e )
-        {
-          throw new RuntimeException( e );
-        }
-    }
-  }
-
-  /**
-   * Similar to code in NutchBean.  This receives the events from the
-   * servlet container and modifies the NutchBean instance put there
-   * by the NutchBeanConstructor listener.  For this to work, it must
-   * be declared after the NutchBeanConstructor in the web.xml file,
-   * e.g.
-   * <pre>
-   * &lt;listener&gt;
-   *    &lt;listener-class&gt;org.apache.nutch.searcher.NutchBean$NutchBeanConstructor&lt;/listener-class&gt;
-   *    &lt;listener-class&gt;org.archive.nutchwax.NutchWaxBean$NutchWaxBeanConstructor&lt;/listener-class&gt;
-   * &lt;/listener&gt;
-   * </pre>
-   */
-  public static class NutchWaxBeanConstructor implements ServletContextListener 
-  {
-    
-    public void contextDestroyed( ServletContextEvent sce )
-    { 
-    }
-    
-    public void contextInitialized( ServletContextEvent sce ) 
-    {
-      ServletContext app = sce.getServletContext();
-      NutchBean bean = (NutchBean) app.getAttribute( NutchBean.KEY );
-
-      if ( bean == null )
-        {
-          LOG.fatal( "No value for \"" + NutchBean.KEY + "\" in servlet context" );
-          
-          return ;
-        }
-
-      // Modify the NutchBean.
-      NutchBeanModifier.modify( bean );
-    }
-
-  }
-
-  /**
-   * Simple command-line driver akin to NutchBean.main that performs
-   * the bean modification.  Useful for testing and debugging from the
-   * command-line.
-   */
-  public static void main(String[] args) throws Exception 
-  {
-    String usage = "NutchWaxBean query";
-
-    if (args.length == 0) 
-      {
-        System.err.println(usage);
-        System.exit(-1);
-      }
-    
-    Configuration conf = NutchConfiguration.create();
-
-    NutchBean bean = new NutchBean(conf);
-    NutchBeanModifier.modify( bean );
-
-    Query query = Query.parse(args[0], conf);
-    Hits hits = bean.search(query, 10);
-    System.out.println("Total hits: " + hits.getTotal());
-    int length = (int)Math.min(hits.getTotal(), 10);
-    Hit[] show = hits.getHits(0, length);
-    HitDetails[] details = bean.getDetails(show);
-    Summary[] summaries = bean.getSummary(details, query);
-    
-    for (int i = 0; i < hits.getLength(); i++) 
-      {
-        // Use a slightly more verbose output than NutchBean.
-        System.out.println( " " 
-                            + i 
-                            + " "
-                            + java.util.Arrays.asList( details[i].getValues( "segment" ) )
-                            + " " 
-                            + java.util.Arrays.asList( details[i].getValues( "url"     ) )
-                            + " " 
-                            + java.util.Arrays.asList( details[i].getValues( "orig"    ) )
-                            + " " 
-                            + java.util.Arrays.asList( details[i].getValues( "digest"  ) )
-                            + " " 
-                            + java.util.Arrays.asList( details[i].getValues( "date"    ) )
-                            + "\n" 
-                            + summaries[i] );
-      }
-  }
- 
-
-}


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.




[Archive-access-cvs] SF.net SVN: archive-access:[2863] trunk/archive-access/projects/nutchwax/ arc

[Archive-access-cvs] SF.net SVN: archive-access:[2863] trunk/archive-access/projects/nutchwax/ archive/src/java/org/archive/nutchwax/NutchWaxBean.java