[Archive-access-cvs] SF.net SVN: archive-access: [2336] trunk/archive-access/projects/nutchwax/ ar

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 2336
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2336&view=rev
Author:   binzino
Date:     2008-06-26 17:31:59 -0700 (Thu, 26 Jun 2008)

Log Message:
-----------
Initial revision.

Added Paths:
-----------
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWaxBean.java

Added: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWaxBean.java
===================================================================

--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWaxBean.java	                        (rev 0)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWaxBean.java	2008-06-27 00:31:59 UTC (rev 2336)
@@ -0,0 +1,238 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.archive.nutchwax;
+
+import java.io.*;
+import java.util.*;
+import java.lang.reflect.Field;
+import javax.servlet.*;
+
+import org.apache.nutch.searcher.NutchBean;
+import org.apache.nutch.searcher.IndexSearcher;
+import org.apache.nutch.searcher.Query;
+import org.apache.nutch.searcher.HitDetails;
+import org.apache.nutch.searcher.Hit;
+import org.apache.nutch.searcher.Hits;
+import org.apache.nutch.searcher.Summary;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Closeable;
+import org.apache.hadoop.conf.Configuration;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.ArchiveParallelReader;
+import org.apache.lucene.index.MultiReader;
+
+import org.apache.nutch.util.HadoopFSUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.indexer.FsDirectory;
+
+/**
+ * Utility class to use and extend the NutchBean class for reading
+ * from parallel indices.
+ *
+ * This can be used from the command-line to run test/debug searches,
+ * the same as NutchBean, but using parallel indices.
+ *
+ * NutchWaxBean doesn't extend NutchBean directly since all the good
+ * stuff inside of NutchBean is declared private.  So, we dynamically
+ * modify a NutchBean instance via reflection to inject our own
+ * IndexReader that reads from a set of parallel indices.
+ *
+ * Before you recoil in horror over this approach, the alternatives
+ * were none too pretty.  Sub-classing won't work since all the
+ * NutchBean data members are declared private.  We could copy the
+ * NutchBean.java into our own source base and effectively over-write
+ * the Nutch version when we compile, but that is a maintenance
+ * headache of extreme magnitude.  Plus, we'd probably have to
+ * copy/paste/edit multiple Java source files.
+ *
+ * Ideally, Nutch would use some sort of dependency injection system,
+ * or at least make the NutchBean data members have public get/set
+ * methods (like a bean should).  For now, doing dynamic injection via
+ * reflection seemed the least obtrusive.
+ */
+public class NutchWaxBean
+{
+
+  /**
+   * Static utility class for modifying a NutchBean instance.
+   */
+  public static class NutchBeanModifier
+  {
+    /** 
+     * Modify the NutchBean by replacing the reader and Lucene searcher in
+     * its IndexSearcher with ones built on a MultiReader over the parallel
+     * indices under 'pindexes'.  Failures are rethrown as RuntimeException.
+     */
+    public static void modify( NutchBean bean )
+    {
+      try
+        {
+          // First, get the configuration from the bean.  Gosh it would be
+          // nice if NutchBean had a getConf() public method, wouldn't it?
+          Field fConf = NutchBean.class.getDeclaredField( "conf" );
+          fConf.setAccessible( true );
+
+          // The rest of this code is similar to NutchBean in that it
+          // looks for a 'pindexes' directory under 'searcher.dir' (default
+          // "crawl"), a sibling of the 'indexes' directory NutchBean finds.
+          Configuration conf = (Configuration) fConf.get( bean );
+
+          FileSystem fs = FileSystem.get( conf );
+      
+          Path dir = new Path( conf.get( "searcher.dir", "crawl") );
+      
+          Path indexesDir = new Path( dir, "pindexes" );
+      
+          Path indexDirs[] = fs.listPaths(indexesDir, HadoopFSUtil.getPassDirectoriesFilter(fs));
+      
+          List<IndexReader> readers = new ArrayList<IndexReader>( indexDirs.length );
+      
+          for ( Path indexDir : indexDirs )
+            {
+              Path parallelDirs[] = fs.listPaths( indexDir, HadoopFSUtil.getPassDirectoriesFilter(fs) );
+              // Skip index directories that have no parallel sub-directories.
+              if ( parallelDirs.length < 1 )
+                {
+                  continue;
+                }
+          
+              ArchiveParallelReader reader = new ArchiveParallelReader( );
+          
+              // Sort the parallelDirs so that we add them in order.  Order
+              // matters to the ParallelReader.
+              Arrays.sort( parallelDirs );
+          
+              for ( Path p : parallelDirs )
+                {
+                  reader.add( IndexReader.open( new FsDirectory( fs, p, false, conf ) ) ); 
+                }
+          
+              readers.add( reader );
+            }
+      
+          MultiReader reader = new MultiReader( readers.toArray( new IndexReader[0] ) );
+      
+          // Now, inject the 'reader' into the NutchBean's IndexSearcher via reflection.
+          Field fSearcher       = NutchBean.class.getDeclaredField( "searcher" );
+          Field fReader         = IndexSearcher.class.getDeclaredField( "reader" );
+          Field fLuceneSearcher = IndexSearcher.class.getDeclaredField( "luceneSearcher" );
+      
+          fSearcher      .setAccessible( true );
+          fReader        .setAccessible( true );
+          fLuceneSearcher.setAccessible( true );
+      
+          org.apache.lucene.search.IndexSearcher newLuceneSearcher = new org.apache.lucene.search.IndexSearcher( reader );
+          // NOTE(review): the objects being replaced below are not closed in this method.
+          IndexSearcher searcher = (IndexSearcher) fSearcher.get( bean );
+          fLuceneSearcher.set( searcher, newLuceneSearcher );
+          fReader        .set( searcher, reader );
+        }
+      catch ( Exception e )
+        {
+          throw new RuntimeException( e );
+        }
+    }
+  }
+
+  /**
+   * Similar to code in NutchBean.  This receives the events from the
+   * servlet container and modifies the NutchBean instance put there
+   * by the NutchBeanConstructor listener.  For this to work, it must
+   * be declared after the NutchBeanConstructor in the web.xml file,
+   * each in its own listener element, e.g.
+   * <pre>
+   * &lt;listener&gt;&lt;listener-class&gt;org.apache.nutch.searcher.NutchBean$NutchBeanConstructor&lt;/listener-class&gt;&lt;/listener&gt;
+   * &lt;listener&gt;&lt;listener-class&gt;org.archive.nutchwax.NutchWaxBean$NutchWaxBeanConstructor&lt;/listener-class&gt;&lt;/listener&gt;
+   * </pre>
+   * If no NutchBean is found in the servlet context, a fatal message
+   * is logged and nothing is modified.
+   */
+  public static class NutchWaxBeanConstructor implements ServletContextListener 
+  {
+    // Intentionally empty: no action is taken when the context is destroyed.
+    public void contextDestroyed( ServletContextEvent sce )
+    { 
+    }
+    
+    public void contextInitialized( ServletContextEvent sce ) 
+    {
+      ServletContext app = sce.getServletContext();
+      NutchBean bean = (NutchBean) app.getAttribute( NutchBean.KEY );
+
+      if ( bean == null )
+        {
+          NutchBean.LOG.fatal( "No value for \"" + NutchBean.KEY + "\" in servlet context" );
+          
+          return ;
+        }
+
+      // Modify the NutchBean.
+      NutchBeanModifier.modify( bean );
+    }
+
+  }
+
+  /**
+   * Simple command-line driver akin to NutchBean.main that performs
+   * the bean modification, then runs the query given in args[0] and
+   * prints the hits (10 are requested).  Useful for testing and debugging.
+   */
+  public static void main(String[] args) throws Exception 
+  {
+    String usage = "NutchWaxBean query";
+
+    if (args.length == 0) 
+      {
+        System.err.println(usage);
+        System.exit(-1);
+      }
+    
+    Configuration conf = NutchConfiguration.create();
+
+    NutchBean bean = new NutchBean(conf);
+    NutchBeanModifier.modify( bean );
+
+    Query query = Query.parse(args[0], conf);
+    Hits hits = bean.search(query, 10);
+    System.out.println("Total hits: " + hits.getTotal());
+    int length = (int)Math.min(hits.getTotal(), 10);
+    Hit[] show = hits.getHits(0, length);
+    HitDetails[] details = bean.getDetails(show);
+    Summary[] summaries = bean.getSummary(details, query);
+    
+    for (int i = 0; i < hits.getLength(); i++) 
+      {
+        // Use a slightly more verbose output than NutchBean.
+        System.out.println( " " 
+                            + i 
+                            + " "
+                            + details[i].getValue( "segment" ) 
+                            + " " 
+                            + details[i].getValue( "url") 
+                            + " " 
+                            + details[i].getValue( "archive-digest") 
+                            + " " 
+                            + java.util.Arrays.asList( details[i].getValues( "date") )
+                            + "\n" 
+                            + summaries[i] );
+      }
+  }
+ 
+
+}


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.




[Archive-access-cvs] SF.net SVN: archive-access: [2336] trunk/archive-access/projects/nutchwax/ ar

[Archive-access-cvs] SF.net SVN: archive-access: [2336] trunk/archive-access/projects/nutchwax/ archive/src/java/org/archive/nutchwax/NutchWaxBean.java