From: <bi...@us...> - 2010-07-11 00:09:35

Revision: 3168
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3168&view=rev
Author:   binzino
Date:     2010-07-11 00:09:27 +0000 (Sun, 11 Jul 2010)

Log Message:
-----------
A whole mess of accumulated hacks to get Importing and Indexing working with Hadoop 0.20 (Cloudera) on our Hadoop rack.

Modified Paths:
--------------
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexMerger.java
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Indexer.java
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/BuildIndex.java
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DumpParallelIndex.java
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/GetUniqFieldValues.java
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/LengthNormUpdater.java
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/lucene/index/ArchiveParallelReader.java
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/query-nutchwax/src/java/org/archive/nutchwax/query/DateQueryFilter.java

Added Paths:
-----------
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/common-terms.utf8
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/indexer/
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/indexer/lucene/
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/indexer/lucene/LuceneWriter.java
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/plugin/
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/plugin/PluginManifestParser.java

Removed Paths:
-------------
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/searcher/DistributedSearch.java
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/searcher/DistributedSegmentBean.java
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/searcher/FetchedSegments.java
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/searcher/LuceneSearchBean.java
    tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/searcher/NutchBean.java

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexMerger.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexMerger.java	2010-07-10 23:34:25 UTC (rev 3167)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/IndexMerger.java	2010-07-11 00:09:27 UTC (rev 3168)
@@ -34,9 +34,10 @@
 import org.apache.nutch.indexer.NutchSimilarity;
 import org.apache.nutch.indexer.FsDirectory;
 
-import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.NIOFSDirectory;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LogMergePolicy;
 import org.apache.lucene.index.ArchiveParallelReader;
 
 /*************************************************************************
@@ -84,10 +85,10 @@
     //
     // Merge indices
     //
-    IndexWriter writer = new IndexWriter(localOutput.toString(), null, true);
-    writer.setMergeFactor(getConf().getInt("indexer.mergeFactor", IndexWriter.DEFAULT_MERGE_FACTOR));
+    IndexWriter writer = new IndexWriter( new NIOFSDirectory( new File( localOutput.toString() ) ), null, IndexWriter.MaxFieldLength.UNLIMITED );
+    writer.setMergeFactor(getConf().getInt("indexer.mergeFactor", LogMergePolicy.DEFAULT_MERGE_FACTOR));
     writer.setMaxBufferedDocs(getConf().getInt("indexer.minMergeDocs", IndexWriter.DEFAULT_MAX_BUFFERED_DOCS));
-    writer.setMaxMergeDocs(getConf().getInt("indexer.maxMergeDocs", IndexWriter.DEFAULT_MAX_MERGE_DOCS));
+    writer.setMaxMergeDocs(getConf().getInt("indexer.maxMergeDocs", LogMergePolicy.DEFAULT_MAX_MERGE_DOCS));
     writer.setTermIndexInterval(getConf().getInt("indexer.termIndexInterval", IndexWriter.DEFAULT_TERM_INDEX_INTERVAL));
     writer.setInfoStream(LogUtil.getDebugStream(LOG));
     writer.setUseCompoundFile(false);

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Indexer.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Indexer.java	2010-07-10 23:34:25 UTC (rev 3167)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Indexer.java	2010-07-11 00:09:27 UTC (rev 3168)
@@ -63,8 +63,8 @@
 
     FileOutputFormat.setOutputPath(job, luceneDir);
 
-    LuceneWriter.addFieldOptions("segment", LuceneWriter.STORE.YES, LuceneWriter.INDEX.NO, job);
-    LuceneWriter.addFieldOptions("digest", LuceneWriter.STORE.YES, LuceneWriter.INDEX.NO, job);
+    //LuceneWriter.addFieldOptions("segment", LuceneWriter.STORE.YES, LuceneWriter.INDEX.NO, job);
+    //LuceneWriter.addFieldOptions("digest", LuceneWriter.STORE.YES, LuceneWriter.INDEX.NO, job);
 
     NutchIndexWriterFactory.addClassToConf(job, LuceneWriter.class);

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/BuildIndex.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/BuildIndex.java	2010-07-10 23:34:25 UTC (rev 3167)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/BuildIndex.java	2010-07-11 00:09:27 UTC (rev 3168)
@@ -20,13 +20,16 @@
  */
 package org.archive.nutchwax.tools;
 
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import java.io.*;
+
 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.store.NIOFSDirectory;
 
 import org.apache.nutch.util.NutchConfiguration;
 
@@ -50,12 +53,12 @@
     String fieldValue = args[2].trim();
     int    count      = Integer.parseInt( args[3].trim() );
 
-    IndexWriter writer = new IndexWriter( indexDir, new WhitespaceAnalyzer( ), true );
+    IndexWriter writer = new IndexWriter( new NIOFSDirectory( new File( indexDir ) ), null, IndexWriter.MaxFieldLength.UNLIMITED );
 
     for ( int i = 0 ; i < count ; i++ )
       {
         Document newDoc = new Document( );
-        newDoc.add( new Field( fieldKey, fieldValue, Field.Store.YES, Field.Index.TOKENIZED ) );
+        newDoc.add( new Field( fieldKey, fieldValue, Field.Store.YES, Field.Index.ANALYZED ) );
         writer.addDocument( newDoc );
       }

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java	2010-07-10 23:34:25 UTC (rev 3167)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DateAdder.java	2010-07-11 00:09:27 UTC (rev 3168)
@@ -20,21 +20,15 @@
  */
 package org.archive.nutchwax.tools;
 
-import java.io.BufferedReader;
-import java.io.FileInputStream;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
+import java.io.*;
+import java.util.*;
 
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.store.NIOFSDirectory;
 
 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.conf.Configuration;
@@ -104,15 +98,15 @@
       }
 
-    IndexReader reader = IndexReader.open( mainIndexDir );
+    IndexReader reader = IndexReader.open( new NIOFSDirectory( new File( mainIndexDir ) ), true );
 
     IndexReader sourceReaders[] = new IndexReader[args.length-3];
 
     for ( int i = 0 ; i < sourceReaders.length ; i++ )
       {
-        sourceReaders[i] = IndexReader.open( args[i+1] );
+        sourceReaders[i] = IndexReader.open( new NIOFSDirectory( new File( args[i+1] ) ), true );
      }
 
-    IndexWriter writer = new IndexWriter( destIndexDir, new WhitespaceAnalyzer( ), true );
+    IndexWriter writer = new IndexWriter( new NIOFSDirectory( new File( destIndexDir ) ), null, IndexWriter.MaxFieldLength.UNLIMITED );
 
     UrlCanonicalizer canonicalizer = getCanonicalizer( this.getConf( ) );
 
@@ -134,7 +128,7 @@
           }
 
         for ( String date : uniqueDates )
          {
-            newDoc.add( new Field( NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.UN_TOKENIZED ) );
+            newDoc.add( new Field( NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.NOT_ANALYZED ) );
          }
 
        // Obtain the new dates for the document.
@@ -162,7 +156,7 @@
          {
            for ( String date : newDates.split("\\s+") )
              {
-                newDoc.add( new Field( NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.UN_TOKENIZED ) );
+                newDoc.add( new Field( NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.NOT_ANALYZED ) );
              }
          }

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DumpParallelIndex.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DumpParallelIndex.java	2010-07-10 23:34:25 UTC (rev 3167)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/DumpParallelIndex.java	2010-07-11 00:09:27 UTC (rev 3168)
@@ -27,7 +27,9 @@
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.ArchiveParallelReader;
+import org.apache.lucene.store.NIOFSDirectory;
+
 
 public class DumpParallelIndex
 {
   public static void main( String[] args ) throws Exception
@@ -58,7 +60,7 @@
     ArchiveParallelReader reader = new ArchiveParallelReader( );
     for ( String dir : dirs )
       {
-        reader.add( IndexReader.open( dir ) );
+        reader.add( IndexReader.open( new NIOFSDirectory( new File( dir ) ) ) );
      }
 
     if ( args[0].equals( "-l" ) )

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/GetUniqFieldValues.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/GetUniqFieldValues.java	2010-07-10 23:34:25 UTC (rev 3167)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/GetUniqFieldValues.java	2010-07-11 00:09:27 UTC (rev 3168)
@@ -20,13 +20,11 @@
  */
 package org.archive.nutchwax.tools;
 
-import java.io.File;
-import java.util.Iterator;
-import java.util.Set;
-import java.util.HashSet;
-import java.util.Collection;
+import java.io.*;
+import java.util.*;
 
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.store.NIOFSDirectory;
 
 /**
  * A quick-n-dirty command-line utility to get the unique values for a
@@ -55,7 +53,7 @@
 
   private static void dumpUniqValues( String fieldName, String indexDir ) throws Exception
   {
-    IndexReader reader = IndexReader.open(indexDir);
+    IndexReader reader = IndexReader.open( new NIOFSDirectory( new File( indexDir) ) );
 
     Collection fieldNames = reader.getFieldNames( IndexReader.FieldOption.ALL );

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/LengthNormUpdater.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/LengthNormUpdater.java	2010-07-10 23:34:25 UTC (rev 3167)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/tools/LengthNormUpdater.java	2010-07-11 00:09:27 UTC (rev 3168)
@@ -16,15 +16,8 @@
  * limitations under the License.
  */
 
-import java.io.BufferedReader;
-import java.io.InputStreamReader;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Set;
-import java.util.Collection;
-import java.util.HashSet;
+import java.io.*;
+import java.util.*;
 
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.Term;
@@ -32,12 +25,11 @@
 import org.apache.lucene.index.TermDocs;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.search.Similarity;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.store.NIOFSDirectory;
 
-
 import org.apache.nutch.indexer.NutchSimilarity;
 
+
 /**
  * This is heavily cribbed from org.apache.lucene.misc.LengthNormModifier
 */
@@ -132,7 +124,7 @@
 
     String pagerankFile = args[pos++];
 
-    IndexReader reader = IndexReader.open( args[pos++] );
+    IndexReader reader = IndexReader.open( new NIOFSDirectory( new File( args[pos++] ) ) );
 
     try
       {

Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/common-terms.utf8
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/common-terms.utf8	(rev 0)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/common-terms.utf8	2010-07-11 00:09:27 UTC (rev 3168)
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Common terms and phrases which will be indexed in n-grams
+# in order to optimize search.
+#content:a
+#content:and
+#content:for
+#content:in
+#content:of
+#content:the
+#content:to
+#url:com
+#url:http
+#url:http-www
+#url:www

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml	2010-07-10 23:34:25 UTC (rev 3167)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml	2010-07-11 00:09:27 UTC (rev 3168)
@@ -10,7 +10,7 @@
   <!-- Add 'index-nutchwax' and 'query-nutchwax' to plugin list.
     -->
   <!-- Also, add 'parse-pdf' -->
   <!-- Remove 'urlfilter-regex' and 'normalizer-(pass|regex|basic)' -->
-  <value>protocol-http|parse-(text|html|js|pdf)|index-nutchwax|query-(basic|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value>
+  <value>protocol-http|parse-(text|html|pdf|msword|mspowerpoint|oo)|index-nutchwax|query-(basic|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value>
 </property>
 
 <!--
@@ -42,6 +42,7 @@
      dest-key = src-key
   -->
   <name>nutchwax.filter.index</name>
+<!--
   <value>
     title:false:true:tokenized
     content:false:compress:tokenized
@@ -55,6 +56,16 @@
     type:true:true:no_norms
     length:false:true:no
   </value>
+-->
+  <value>
+    title:false:true:tokenized
+    content:false:compress:tokenized
+    site:false:false:untokenized
+    url:false:true:tokenized
+    type:true:true:no_norms
+    date:false:true:no
+    length:false:true:no
+  </value>
 </property>
 
 <property>

Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/lucene/index/ArchiveParallelReader.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/lucene/index/ArchiveParallelReader.java	2010-07-10 23:34:25 UTC (rev 3167)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/lucene/index/ArchiveParallelReader.java	2010-07-11 00:09:27 UTC (rev 3168)
@@ -1,3 +1,5 @@
+package org.apache.lucene.index;
+
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -14,24 +16,11 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-/**
- * ARCHIVE: This must be in the lucene index package because it needs
- * to call protected methods on other IndexReader objects.
- */
-package org.apache.lucene.index;
 
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.FieldSelector;
 import org.apache.lucene.document.FieldSelectorResult;
 import org.apache.lucene.document.Fieldable;
-import org.apache.lucene.index.CorruptIndexException;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermDocs;
-import org.apache.lucene.index.TermEnum;
-import org.apache.lucene.index.TermFreqVector;
-import org.apache.lucene.index.TermPositions;
-import org.apache.lucene.index.TermVectorMapper;
 
 import java.io.IOException;
 import java.util.*;
@@ -55,10 +44,12 @@
  * undefined behavior</em>.
  */
 public class ArchiveParallelReader extends IndexReader {
-  private List readers = new ArrayList();
-  private List decrefOnClose = new ArrayList(); // remember which subreaders to decRef on close
-  boolean incRefReaders = false;
-  private SortedMap fieldToReader = new TreeMap();
+  private List<IndexReader> readers = new ArrayList<IndexReader>();
+  private List<Boolean> decrefOnClose = new ArrayList<Boolean>(); // remember which subreaders to decRef on close
+  boolean incRefReaders = false;
+  private SortedMap<String,IndexReader> fieldToReader = new TreeMap<String,IndexReader>();
+  private Map<IndexReader,Collection<String>> readerToFields = new HashMap<IndexReader,Collection<String>>();
+  private List<IndexReader> storedFieldReaders = new ArrayList<IndexReader>();
 
   private int maxDoc;
   private int numDocs;
@@ -81,9 +72,25 @@
   /** Add an IndexReader.
* @throws IOException if there is a low-level IO error */ - public void add(IndexReader reader) throws IOException - { + public void add(IndexReader reader) throws IOException { ensureOpen(); + add(reader, false); + } + + /** Add an IndexReader whose stored fields will not be returned. This can + * accelerate search when stored fields are only needed from a subset of + * the IndexReaders. + * + * @throws IllegalArgumentException if not all indexes contain the same number + * of documents + * @throws IllegalArgumentException if not all indexes have the same value + * of {@link IndexReader#maxDoc()} + * @throws IOException if there is a low-level IO error + */ + public void add(IndexReader reader, boolean ignoreStoredFields) + throws IOException { + + ensureOpen(); if (readers.size() == 0) { this.maxDoc = reader.maxDoc(); this.numDocs = reader.numDocs(); @@ -97,14 +104,15 @@ throw new IllegalArgumentException ("All readers must have same numDocs: "+numDocs+"!="+reader.numDocs()); - Collection fields = reader.getFieldNames(IndexReader.FieldOption.ALL); - Iterator i = fields.iterator(); - while (i.hasNext()) { // update fieldToReader map - String field = (String)i.next(); + Collection<String> fields = reader.getFieldNames(IndexReader.FieldOption.ALL); + readerToFields.put(reader, fields); + for (final String field : fields) { // update fieldToReader map if (fieldToReader.get(field) == null) fieldToReader.put(field, reader); } + if (!ignoreStoredFields) + storedFieldReaders.add(reader); // add to storedFieldReaders readers.add(reader); if (incRefReaders) { @@ -112,7 +120,16 @@ } decrefOnClose.add(Boolean.valueOf(incRefReaders)); } - + + @Override + public synchronized Object clone() { + try { + return doReopen(true); + } catch (Exception ex) { + throw new RuntimeException(ex); + } + } + /** * Tries to reopen the subreaders. 
* <br> @@ -132,63 +149,42 @@ * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ - public IndexReader reopen() throws CorruptIndexException, IOException { + @Override + public synchronized IndexReader reopen() throws CorruptIndexException, IOException { + return doReopen(false); + } + + protected IndexReader doReopen(boolean doClone) throws CorruptIndexException, IOException { ensureOpen(); boolean reopened = false; - List newReaders = new ArrayList(); - List newDecrefOnClose = new ArrayList(); + List<IndexReader> newReaders = new ArrayList<IndexReader>(); boolean success = false; try { - - for (int i = 0; i < readers.size(); i++) { - IndexReader oldReader = (IndexReader) readers.get(i); - IndexReader newReader = oldReader.reopen(); + for (final IndexReader oldReader : readers) { + IndexReader newReader = null; + if (doClone) { + newReader = (IndexReader) oldReader.clone(); + } else { + newReader = oldReader.reopen(); + } newReaders.add(newReader); // if at least one of the subreaders was updated we remember that - // and return a new MultiReader + // and return a new ArchiveParallelReader if (newReader != oldReader) { reopened = true; } } - - if (reopened) { - ArchiveParallelReader pr = new ArchiveParallelReader(); - for (int i = 0; i < readers.size(); i++) { - IndexReader oldReader = (IndexReader) readers.get(i); - IndexReader newReader = (IndexReader) newReaders.get(i); - if (newReader == oldReader) { - newDecrefOnClose.add(Boolean.TRUE); - newReader.incRef(); - } else { - // this is a new subreader instance, so on close() we don't - // decRef but close it - newDecrefOnClose.add(Boolean.FALSE); - } - pr.add(newReader); - } - pr.decrefOnClose = newDecrefOnClose; - pr.incRefReaders = incRefReaders; - success = true; - return pr; - } else { - success = true; - // No subreader was refreshed - return this; - } + success = true; } finally { if (!success && reopened) { for (int i = 0; i < newReaders.size(); i++) { - IndexReader r = (IndexReader) newReaders.get(i); - if (r != null) { + IndexReader r = newReaders.get(i); + if (r != readers.get(i)) { try { - if (((Boolean) newDecrefOnClose.get(i)).booleanValue()) { - r.decRef(); - } else { - r.close(); - } + r.close(); } catch (IOException ignore) { // keep going - we want to clean up as much as possible } @@ -196,46 +192,74 @@ } } } + + if (reopened) { + List<Boolean> newDecrefOnClose = new ArrayList<Boolean>(); + ArchiveParallelReader pr = new ArchiveParallelReader(); + for (int i = 0; i < readers.size(); i++) { + IndexReader oldReader = readers.get(i); + IndexReader newReader = newReaders.get(i); + if (newReader == oldReader) { + newDecrefOnClose.add(Boolean.TRUE); + newReader.incRef(); + } else { + // this is a new subreader instance, so on close() we don't + // decRef but close it + newDecrefOnClose.add(Boolean.FALSE); + } + pr.add(newReader, !storedFieldReaders.contains(oldReader)); + } + pr.decrefOnClose = newDecrefOnClose; + pr.incRefReaders = incRefReaders; + return pr; + } else { + // No subreader was refreshed + return this; + } } + @Override public int numDocs() { // Don't call ensureOpen() here (it could affect performance) return numDocs; } + @Override public int maxDoc() { // Don't call ensureOpen() here (it could affect performance) return maxDoc; } + @Override public boolean hasDeletions() { // Don't call ensureOpen() here (it could affect performance) return hasDeletions; } // check first reader + @Override public boolean isDeleted(int n) { // Don't call ensureOpen() 
here (it could affect performance) if (readers.size() > 0) - return ((IndexReader)readers.get(0)).isDeleted(n); + return readers.get(0).isDeleted(n); return false; } // delete in all readers + @Override protected void doDelete(int n) throws CorruptIndexException, IOException { - for (int i = 0; i < readers.size(); i++) { - ((IndexReader)readers.get(i)).deleteDocument(n); + for (final IndexReader reader : readers) { + reader.deleteDocument(n); } hasDeletions = true; } - /** - * @see org.apache.lucene.index.ParallelReader.doUndeleteAll - */ + // undeleteAll in all readers + @Override protected void doUndeleteAll() throws CorruptIndexException, IOException { - for (int i = 0; i < readers.size(); i++) { - ((IndexReader)readers.get(i)).undeleteAll(); + for (final IndexReader reader : readers) { + reader.undeleteAll(); } hasDeletions = false; } @@ -289,111 +313,150 @@ return result; } + /* + // append fields from storedFieldReaders + @Override + public Document document(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException { + ensureOpen(); + Document result = new Document(); + for (final IndexReader reader: storedFieldReaders) { + + boolean include = (fieldSelector==null); + if (!include) { + Collection<String> fields = readerToFields.get(reader); + for (final String field : fields) + if (fieldSelector.accept(field) != FieldSelectorResult.NO_LOAD) { + include = true; + break; + } + } + if (include) { + List<Fieldable> fields = reader.document(n, fieldSelector).getFields(); + for (Fieldable field : fields) { + result.add(field); + } + } + } + return result; + } + */ + // get all vectors + @Override public TermFreqVector[] getTermFreqVectors(int n) throws IOException { ensureOpen(); - ArrayList results = new ArrayList(); - Iterator i = fieldToReader.entrySet().iterator(); - while (i.hasNext()) { - Map.Entry e = (Map.Entry)i.next(); - String field = (String)e.getKey(); - IndexReader reader = (IndexReader)e.getValue(); + ArrayList<TermFreqVector> results = new ArrayList<TermFreqVector>(); + for (final Map.Entry<String,IndexReader> e: fieldToReader.entrySet()) { + + String field = e.getKey(); + IndexReader reader = e.getValue(); TermFreqVector vector = reader.getTermFreqVector(n, field); if (vector != null) results.add(vector); } - return (TermFreqVector[]) - results.toArray(new TermFreqVector[results.size()]); + return results.toArray(new TermFreqVector[results.size()]); } + @Override public TermFreqVector getTermFreqVector(int n, String field) throws IOException { ensureOpen(); - IndexReader reader = ((IndexReader)fieldToReader.get(field)); + IndexReader reader = fieldToReader.get(field); return reader==null ? 
null : reader.getTermFreqVector(n, field); } + @Override public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException { ensureOpen(); - IndexReader reader = ((IndexReader)fieldToReader.get(field)); + IndexReader reader = fieldToReader.get(field); if (reader != null) { reader.getTermFreqVector(docNumber, field, mapper); } } + @Override public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException { ensureOpen(); - ensureOpen(); - Iterator i = fieldToReader.entrySet().iterator(); - while (i.hasNext()) { - Map.Entry e = (Map.Entry)i.next(); - String field = (String)e.getKey(); - IndexReader reader = (IndexReader)e.getValue(); + for (final Map.Entry<String,IndexReader> e : fieldToReader.entrySet()) { + + String field = e.getKey(); + IndexReader reader = e.getValue(); reader.getTermFreqVector(docNumber, field, mapper); } } + @Override public boolean hasNorms(String field) throws IOException { ensureOpen(); - IndexReader reader = ((IndexReader)fieldToReader.get(field)); + IndexReader reader = fieldToReader.get(field); return reader==null ? false : reader.hasNorms(field); } + @Override public byte[] norms(String field) throws IOException { ensureOpen(); - IndexReader reader = ((IndexReader)fieldToReader.get(field)); + IndexReader reader = fieldToReader.get(field); return reader==null ? null : reader.norms(field); } + @Override public void norms(String field, byte[] result, int offset) throws IOException { ensureOpen(); - IndexReader reader = ((IndexReader)fieldToReader.get(field)); + IndexReader reader = fieldToReader.get(field); if (reader!=null) reader.norms(field, result, offset); } + @Override protected void doSetNorm(int n, String field, byte value) throws CorruptIndexException, IOException { - IndexReader reader = ((IndexReader)fieldToReader.get(field)); + IndexReader reader = fieldToReader.get(field); if (reader!=null) reader.doSetNorm(n, field, value); } + @Override public TermEnum terms() throws IOException { ensureOpen(); return new ParallelTermEnum(); } + @Override public TermEnum terms(Term term) throws IOException { ensureOpen(); return new ParallelTermEnum(term); } + @Override public int docFreq(Term term) throws IOException { ensureOpen(); - IndexReader reader = ((IndexReader)fieldToReader.get(term.field())); + IndexReader reader = fieldToReader.get(term.field()); return reader==null ? 0 : reader.docFreq(term); } + @Override public TermDocs termDocs(Term term) throws IOException { ensureOpen(); return new ParallelTermDocs(term); } + @Override public TermDocs termDocs() throws IOException { ensureOpen(); return new ParallelTermDocs(); } + @Override public TermPositions termPositions(Term term) throws IOException { ensureOpen(); return new ParallelTermPositions(term); } + @Override public TermPositions termPositions() throws IOException { ensureOpen(); return new ParallelTermPositions(); @@ -402,9 +465,10 @@ /** * Checks recursively if all subreaders are up to date. 
*/ + @Override public boolean isCurrent() throws CorruptIndexException, IOException { - for (int i = 0; i < readers.size(); i++) { - if (!((IndexReader)readers.get(i)).isCurrent()) { + for (final IndexReader reader : readers) { + if (!reader.isCurrent()) { return false; } } @@ -416,9 +480,10 @@ /** * Checks recursively if all subindexes are optimized */ + @Override public boolean isOptimized() { - for (int i = 0; i < readers.size(); i++) { - if (!((IndexReader)readers.get(i)).isOptimized()) { + for (final IndexReader reader : readers) { + if (!reader.isOptimized()) { return false; } } @@ -431,36 +496,39 @@ /** Not implemented. * @throws UnsupportedOperationException */ + @Override public long getVersion() { throw new UnsupportedOperationException("ArchiveParallelReader does not support this method."); } // for testing IndexReader[] getSubReaders() { - return (IndexReader[]) readers.toArray(new IndexReader[readers.size()]); + return readers.toArray(new IndexReader[readers.size()]); } - protected void doCommit() throws IOException { - for (int i = 0; i < readers.size(); i++) - ((IndexReader)readers.get(i)).commit(); + @Override + protected void doCommit(Map<String,String> commitUserData) throws IOException { + for (final IndexReader reader : readers) + reader.commit(commitUserData); } + @Override protected synchronized void doClose() throws IOException { for (int i = 0; i < readers.size(); i++) { - if (((Boolean) decrefOnClose.get(i)).booleanValue()) { - ((IndexReader)readers.get(i)).decRef(); + if (decrefOnClose.get(i).booleanValue()) { + readers.get(i).decRef(); } else { - ((IndexReader)readers.get(i)).close(); + readers.get(i).close(); } } } - public Collection getFieldNames (IndexReader.FieldOption fieldNames) { + @Override + public Collection<String> getFieldNames (IndexReader.FieldOption fieldNames) { ensureOpen(); - Set fieldSet = new HashSet(); - for (int i = 0; i < readers.size(); i++) { - IndexReader reader = ((IndexReader)readers.get(i)); - Collection names = reader.getFieldNames(fieldNames); + Set<String> fieldSet = new HashSet<String>(); + for (final IndexReader reader : readers) { + Collection<String> names = reader.getFieldNames(fieldNames); fieldSet.addAll(names); } return fieldSet; @@ -468,24 +536,28 @@ private class ParallelTermEnum extends TermEnum { private String field; - private Iterator fieldIterator; + private Iterator<String> fieldIterator; private TermEnum termEnum; public ParallelTermEnum() throws IOException { - if ( fieldToReader.isEmpty( ) ) return ; - - field = (String)fieldToReader.firstKey(); + try { + field = fieldToReader.firstKey(); + } catch(NoSuchElementException e) { + // No fields, so keep field == null, termEnum == null + return; + } if (field != null) - termEnum = ((IndexReader)fieldToReader.get(field)).terms(); + termEnum = fieldToReader.get(field).terms(); } public ParallelTermEnum(Term term) throws IOException { field = term.field(); - IndexReader reader = ((IndexReader)fieldToReader.get(field)); + IndexReader reader = fieldToReader.get(field); if (reader!=null) termEnum = reader.terms(term); } + @Override public boolean next() throws IOException { if (termEnum==null) return false; @@ -502,8 +574,8 @@ fieldIterator.next(); // Skip field to get next one } while (fieldIterator.hasNext()) { - field = (String) fieldIterator.next(); - termEnum = ((IndexReader)fieldToReader.get(field)).terms(new Term(field, "")); + field = fieldIterator.next(); + termEnum = fieldToReader.get(field).terms(new Term(field)); Term term = termEnum.term(); if (term!=null && 
term.field()==field) return true; @@ -514,6 +586,7 @@ return false; // no more fields } + @Override public Term term() { if (termEnum==null) return null; @@ -521,6 +594,7 @@ return termEnum.term(); } + @Override public int docFreq() { if (termEnum==null) return 0; @@ -528,6 +602,7 @@ return termEnum.docFreq(); } + @Override public void close() throws IOException { if (termEnum!=null) termEnum.close(); @@ -540,13 +615,18 @@ protected TermDocs termDocs; public ParallelTermDocs() {} - public ParallelTermDocs(Term term) throws IOException { seek(term); } + public ParallelTermDocs(Term term) throws IOException { + if (term == null) + termDocs = readers.isEmpty() ? null : readers.get(0).termDocs(null); + else + seek(term); + } public int doc() { return termDocs.doc(); } public int freq() { return termDocs.freq(); } public void seek(Term term) throws IOException { - IndexReader reader = ((IndexReader)fieldToReader.get(term.field())); + IndexReader reader = fieldToReader.get(term.field()); termDocs = reader!=null ? reader.termDocs(term) : null; } @@ -588,8 +668,9 @@ public ParallelTermPositions() {} public ParallelTermPositions(Term term) throws IOException { seek(term); } + @Override public void seek(Term term) throws IOException { - IndexReader reader = ((IndexReader)fieldToReader.get(term.field())); + IndexReader reader = fieldToReader.get(term.field()); termDocs = reader!=null ? reader.termPositions(term) : null; } @@ -614,3 +695,8 @@ } } + + + + + Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/indexer/lucene/LuceneWriter.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/indexer/lucene/LuceneWriter.java (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/indexer/lucene/LuceneWriter.java 2010-07-11 00:09:27 UTC (rev 3168) @@ -0,0 +1,330 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.indexer.lucene; + +import java.io.File; +import java.io.IOException; +import java.io.ByteArrayOutputStream; +import java.io.OutputStreamWriter; +import java.util.zip.GZIPOutputStream; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.Set; +import java.util.Map.Entry; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobConf; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.CompressionTools; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriter.MaxFieldLength; +import org.apache.lucene.store.FSDirectory; +import org.apache.nutch.analysis.AnalyzerFactory; +import org.apache.nutch.analysis.NutchAnalyzer; +import org.apache.nutch.analysis.NutchDocumentAnalyzer; +import org.apache.nutch.indexer.Indexer; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.indexer.NutchIndexWriter; +import org.apache.nutch.indexer.NutchSimilarity; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.util.LogUtil; + +public class LuceneWriter implements NutchIndexWriter { + + public static enum STORE { YES, NO, COMPRESS } + + public static enum INDEX { NO, NO_NORMS, TOKENIZED, UNTOKENIZED } + + public static enum VECTOR { NO, OFFSET, POS, POS_OFFSET, YES } + + private IndexWriter writer; + + private AnalyzerFactory analyzerFactory; + + private Path perm; + + private Path temp; + + private FileSystem fs; + + private final Map<String, Field.Store> fieldStore; + private final Set<String> fieldCompress; + + private final Map<String, Field.Index> fieldIndex; + + private final Map<String, Field.TermVector> fieldVector; + + public LuceneWriter() { + fieldStore = new HashMap<String, Field.Store>(); + fieldCompress = new HashSet<String>(); + fieldIndex = new HashMap<String, Field.Index>(); + fieldVector = new HashMap<String, Field.TermVector>(); + } + + private Document createLuceneDoc(NutchDocument doc) { + final Document out = new Document(); + + out.setBoost(doc.getScore()); + + final Metadata documentMeta = doc.getDocumentMeta(); + for (final Entry<String, List<String>> entry : doc) { + final String fieldName = entry.getKey(); + + Field.Store store = fieldStore.get(fieldName); + boolean compress = fieldCompress.contains(fieldName); + Field.Index index = fieldIndex.get(fieldName); + Field.TermVector vector = fieldVector.get(fieldName); + + // default values + if (store == null) { + store = Field.Store.NO; + } + + if (index == null) { + index = Field.Index.NO; + } + + if (vector == null) { + vector = Field.TermVector.NO; + } + + // read document-level field information + final String[] fieldMetas = + documentMeta.getValues(LuceneConstants.FIELD_PREFIX + fieldName); + if (fieldMetas.length != 0) { + for (final String val : fieldMetas) { + System.out.println( fieldName + " : " + val ); + if (LuceneConstants.STORE_YES.equals(val)) { + store = Field.Store.YES; + } else if (LuceneConstants.STORE_NO.equals(val)) { + store = Field.Store.NO; + } else if (LuceneConstants.STORE_COMPRESS.equals(val)) { + compress = true; + } else if (LuceneConstants.INDEX_TOKENIZED.equals(val)) { + index = Field.Index.ANALYZED; + } else if (LuceneConstants.INDEX_NO.equals(val)) { + index = 
Field.Index.NO; + } else if (LuceneConstants.INDEX_UNTOKENIZED.equals(val)) { + index = Field.Index.NOT_ANALYZED; + } else if (LuceneConstants.INDEX_NO_NORMS.equals(val)) { + index = Field.Index.ANALYZED_NO_NORMS; + } else if (LuceneConstants.VECTOR_NO.equals(val)) { + vector = Field.TermVector.NO; + } else if (LuceneConstants.VECTOR_YES.equals(val)) { + vector = Field.TermVector.YES; + } else if (LuceneConstants.VECTOR_POS.equals(val)) { + vector = Field.TermVector.WITH_POSITIONS; + } else if (LuceneConstants.VECTOR_POS_OFFSET.equals(val)) { + vector = Field.TermVector.WITH_POSITIONS_OFFSETS; + } else if (LuceneConstants.VECTOR_OFFSET.equals(val)) { + vector = Field.TermVector.WITH_OFFSETS; + } + } + } + + for (final String fieldValue : entry.getValue()) { + if ( compress ) + { + out.add( new Field( fieldName, CompressionTools.compressString( fieldValue ), Field.Store.YES ) ); + } + out.add(new Field(fieldName, fieldValue, store, index, vector)); + } + } + + return out; + } + + @SuppressWarnings("unchecked") + private void processOptions(Configuration conf) { + final Iterator iterator = conf.iterator(); + while (iterator.hasNext()) { + final String key = (String) ((Map.Entry)iterator.next()).getKey(); + if (!key.startsWith(LuceneConstants.LUCENE_PREFIX)) { + continue; + } + if (key.startsWith(LuceneConstants.FIELD_STORE_PREFIX)) { + final String field = + key.substring(LuceneConstants.FIELD_STORE_PREFIX.length()); + final LuceneWriter.STORE store = LuceneWriter.STORE.valueOf(conf.get(key)); + switch (store) { + case YES: + fieldStore.put(field, Field.Store.YES); + break; + case NO: + fieldStore.put(field, Field.Store.NO); + break; + case COMPRESS: + fieldCompress.add(field); + break; + } + } else if (key.startsWith(LuceneConstants.FIELD_INDEX_PREFIX)) { + final String field = + key.substring(LuceneConstants.FIELD_INDEX_PREFIX.length()); + final LuceneWriter.INDEX index = LuceneWriter.INDEX.valueOf(conf.get(key)); + switch (index) { + case NO: + fieldIndex.put(field, Field.Index.NO); + break; + case NO_NORMS: + fieldIndex.put(field, Field.Index.NOT_ANALYZED_NO_NORMS); + break; + case TOKENIZED: + fieldIndex.put(field, Field.Index.ANALYZED); + break; + case UNTOKENIZED: + fieldIndex.put(field, Field.Index.NOT_ANALYZED); + break; + } + } else if (key.startsWith(LuceneConstants.FIELD_VECTOR_PREFIX)) { + final String field = + key.substring(LuceneConstants.FIELD_VECTOR_PREFIX.length()); + final LuceneWriter.VECTOR vector = LuceneWriter.VECTOR.valueOf(conf.get(key)); + switch (vector) { + case NO: + fieldVector.put(field, Field.TermVector.NO); + break; + case OFFSET: + fieldVector.put(field, Field.TermVector.WITH_OFFSETS); + break; + case POS: + fieldVector.put(field, Field.TermVector.WITH_POSITIONS); + break; + case POS_OFFSET: + fieldVector.put(field, Field.TermVector.WITH_POSITIONS_OFFSETS); + break; + case YES: + fieldVector.put(field, Field.TermVector.YES); + break; + } + } + } + } + + public void open(JobConf job, String name) + throws IOException { + this.fs = FileSystem.get(job); + perm = new Path(FileOutputFormat.getOutputPath(job), name); + temp = job.getLocalPath("index/_" + + Integer.toString(new Random().nextInt())); + + fs.delete(perm, true); // delete old, if any + analyzerFactory = new AnalyzerFactory(job); + writer = new IndexWriter( + FSDirectory.open(new File(fs.startLocalOutput(perm, temp).toString())), + new NutchDocumentAnalyzer(job), true, MaxFieldLength.UNLIMITED); + + writer.setMergeFactor(job.getInt("indexer.mergeFactor", 10)); + 
writer.setMaxBufferedDocs(job.getInt("indexer.minMergeDocs", 100)); + writer.setMaxMergeDocs(job + .getInt("indexer.maxMergeDocs", Integer.MAX_VALUE)); + writer.setTermIndexInterval(job.getInt("indexer.termIndexInterval", 128)); + writer.setMaxFieldLength(job.getInt("indexer.max.tokens", 10000)); + writer.setInfoStream(LogUtil.getDebugStream(Indexer.LOG)); + writer.setUseCompoundFile(false); + writer.setSimilarity(new NutchSimilarity()); + + processOptions(job); + } + + public void close() throws IOException { + writer.optimize(); + writer.close(); + fs.completeLocalOutput(perm, temp); // copy to dfs + fs.createNewFile(new Path(perm, Indexer.DONE_NAME)); + } + + public void write(NutchDocument doc) throws IOException { + final Document luceneDoc = createLuceneDoc(doc); + final NutchAnalyzer analyzer = analyzerFactory.get(luceneDoc.get("lang")); + if (Indexer.LOG.isDebugEnabled()) { + Indexer.LOG.debug("Indexing [" + luceneDoc.get("url") + + "] with analyzer " + analyzer + " (" + luceneDoc.get("lang") + + ")"); + } + writer.addDocument(luceneDoc, analyzer); + + } + + /** Adds a lucene field. + * <p> + * This method is provided for backward-compatibility with + * older indexing filters. This should not be used by newer + * implementations since this is slower than + * {@link NutchDocument#add(String, String)} and will be removed + * in a future release. + * </p> + * @param f Lucene field to be added. + * @deprecated Use {@link NutchDocument#add(String, String)} instead and + * set index-level metadata for field information. + * */ + @Deprecated + public static void add(NutchDocument doc, Field f) { + final String fieldName = f.name(); + final String key = LuceneConstants.FIELD_PREFIX + fieldName; + final Metadata documentMeta = doc.getDocumentMeta(); + if (f.isStored()) { + documentMeta.add(key, LuceneConstants.STORE_YES); + } else { + documentMeta.add(key, LuceneConstants.STORE_NO); + } + + if (f.isIndexed()) { + if (f.isTokenized()) { + documentMeta.add(key, LuceneConstants.INDEX_TOKENIZED); + } else if (f.getOmitNorms()) { + documentMeta.add(key, LuceneConstants.INDEX_NO_NORMS); + } else { + documentMeta.add(key, LuceneConstants.INDEX_UNTOKENIZED); + } + } else { + documentMeta.add(key, LuceneConstants.INDEX_NO); + } + + if (f.isStoreOffsetWithTermVector() && f.isStorePositionWithTermVector()) { + documentMeta.add(key, LuceneConstants.VECTOR_POS_OFFSET); + } else if (f.isStoreOffsetWithTermVector()) { + documentMeta.add(key, LuceneConstants.VECTOR_OFFSET); + } else if (f.isStorePositionWithTermVector()) { + documentMeta.add(key, LuceneConstants.VECTOR_POS); + } else if (f.isTermVectorStored()) { + documentMeta.add(key, LuceneConstants.VECTOR_YES); + } else { + documentMeta.add(key, LuceneConstants.VECTOR_NO); + } + } + + public static void addFieldOptions(String field, LuceneWriter.STORE store, + LuceneWriter.INDEX index, LuceneWriter.VECTOR vector, Configuration conf) { + + conf.set(LuceneConstants.FIELD_STORE_PREFIX + field, store.toString()); + conf.set(LuceneConstants.FIELD_INDEX_PREFIX + field, index.toString()); + conf.set(LuceneConstants.FIELD_VECTOR_PREFIX + field, vector.toString()); + } + + public static void addFieldOptions(String field, LuceneWriter.STORE store, + LuceneWriter.INDEX index, Configuration conf) { + LuceneWriter.addFieldOptions(field, store, index, LuceneWriter.VECTOR.NO, conf); + } +} Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/plugin/PluginManifestParser.java 
=================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/plugin/PluginManifestParser.java (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/java/org/apache/nutch/plugin/PluginManifestParser.java 2010-07-11 00:09:27 UTC (rev 3168) @@ -0,0 +1,326 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.plugin; + +import java.io.File; +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.net.MalformedURLException; +import java.net.URL; +import java.net.URLDecoder; +import java.util.HashMap; +import java.util.Map; + +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; + +import org.apache.commons.logging.Log; + +import org.apache.hadoop.conf.Configuration; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.xml.sax.SAXException; + +/** + * The <code>PluginManifestParser</code> parser just parse the manifest file + * in all plugin directories. + * + * @author joa23 + */ +public class PluginManifestParser { + private static final String ATTR_NAME = "name"; + private static final String ATTR_CLASS = "class"; + private static final String ATTR_ID = "id"; + + public static final Log LOG = PluginRepository.LOG; + + private static final boolean WINDOWS = System.getProperty("os.name") + .startsWith("Windows"); + + private Configuration conf; + + private PluginRepository pluginRepository; + + public PluginManifestParser(Configuration conf, + PluginRepository pluginRepository) { + this.conf = conf; + this.pluginRepository = pluginRepository; + } + + /** + * Returns a list of all found plugin descriptors. + * + * @param pluginFolders + * folders to search plugins from + * @return A {@link Map} of all found {@link PluginDescriptor}s. 
+ */ + public Map<String, PluginDescriptor> parsePluginFolder(String[] pluginFolders) { + Map<String, PluginDescriptor> map = new HashMap<String, PluginDescriptor>(); + + if (pluginFolders == null) { + throw new IllegalArgumentException("plugin.folders is not defined"); + } + + for (String name : pluginFolders) { + File directory = getPluginFolder(name); + if (directory == null) { + continue; + } + LOG.info("Plugins: looking in: " + directory.getAbsolutePath()); + for (File oneSubFolder : directory.listFiles()) { + if (oneSubFolder.isDirectory()) { + String manifestPath = oneSubFolder.getAbsolutePath() + File.separator + + "plugin.xml"; + try { + LOG.debug("parsing: " + manifestPath); + PluginDescriptor p = parseManifestFile(manifestPath); + map.put(p.getPluginId(), p); + } catch (MalformedURLException e) { + LOG.warn(e.toString()); + } catch (SAXException e) { + LOG.warn(e.toString()); + } catch (IOException e) { + LOG.warn(e.toString()); + } catch (ParserConfigurationException e) { + LOG.warn(e.toString()); + } + } + } + } + return map; + } + + /** + * Return the named plugin folder. If the name is absolute then it is + * returned. Otherwise, for relative names, the classpath is scanned. + */ + public File getPluginFolder(String name) { + File directory = new File(name); + if (!directory.isAbsolute()) { + URL url = PluginManifestParser.class.getClassLoader().getResource(name); + if (url == null && directory.exists() && directory.isDirectory() + && directory.listFiles().length > 0) { + return directory; // relative path that is not in the classpath + } else if (url == null) { + LOG.warn("Plugins: directory not found: " + name); + return null; + } else if ( "jar".equals(url.getProtocol()) ) { + try + { + // HACK to find directory containing .jar file and look for plugins there. + LOG.warn( "HACK to look for plugin directory next to jar file: " + url ); + java.net.JarURLConnection connection = (java.net.JarURLConnection) url.openConnection(); + URL url2 = connection.getJarFileURL(); + if ( !"file".equals(url2.getProtocol()) ) + { + LOG.warn( "Jar file is not a file: " + url2 ); + return null; + } + directory = new File( new File( url2.getFile() ).getParent( ) + "/" + name ); + LOG.warn( "Plugin directory: " + directory ); + return directory; + } + catch ( IOException ioe ) + { + LOG.warn( ioe ); + return null; + } + } else if (!"file".equals(url.getProtocol())) { + LOG.warn("Plugins: not a file: url. 
Can't load plugins from: " + url); + return null; + } + String path = url.getPath(); + if (WINDOWS && path.startsWith("/")) // patch a windows bug + path = path.substring(1); + try { + path = URLDecoder.decode(path, "UTF-8"); // decode the url path + } catch (UnsupportedEncodingException e) { + } + directory = new File(path); + } + return directory; + } + + /** + * @param manifestPath + * @throws ParserConfigurationException + * @throws IOException + * @throws SAXException + * @throws MalformedURLException + */ + private PluginDescriptor parseManifestFile(String pManifestPath) + throws MalformedURLException, SAXException, IOException, + ParserConfigurationException { + Document document = parseXML(new File(pManifestPath).toURL()); + String pPath = new File(pManifestPath).getParent(); + return parsePlugin(document, pPath); + } + + /** + * @param url + * @return Document + * @throws IOException + * @throws SAXException + * @throws ParserConfigurationException + * @throws DocumentException + */ + private Document parseXML(URL url) throws SAXException, IOException, + ParserConfigurationException { + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + DocumentBuilder builder = factory.newDocumentBuilder(); + return builder.parse(url.openStream()); + } + + /** + * @param pDocument + * @throws MalformedURLException + */ + private PluginDescriptor parsePlugin(Document pDocument, String pPath) + throws MalformedURLException { + Element rootElement = pDocument.getDocumentElement(); + String id = rootElement.getAttribute(ATTR_ID); + String name = rootElement.getAttribute(ATTR_NAME); + String version = rootElement.getAttribute("version"); + String providerName = rootElement.getAttribute("provider-name"); + String pluginClazz = null; + if (rootElement.getAttribute(ATTR_CLASS).trim().length() > 0) { + pluginClazz = rootElement.getAttribute(ATTR_CLASS); + } + PluginDescriptor pluginDescriptor = new PluginDescriptor(id, version, name, + providerName, pluginClazz, pPath, this.conf); + LOG.debug("plugin: id=" + id + " name=" + name + " version=" + version + + " provider=" + providerName + "class=" + pluginClazz); + parseExtension(rootElement, pluginDescriptor); + parseExtensionPoints(rootElement, pluginDescriptor); + parseLibraries(rootElement, pluginDescriptor); + parseRequires(rootElement, pluginDescriptor); + return pluginDescriptor; + } + + /** + * @param pRootElement + * @param pDescriptor + * @throws MalformedURLException + */ + private void parseRequires(Element pRootElement, PluginDescriptor pDescriptor) + throws MalformedURLException { + + NodeList nodelist = pRootElement.getElementsByTagName("requires"); + if (nodelist.getLength() > 0) { + + Element requires = (Element) nodelist.item(0); + + NodeList imports = requires.getElementsByTagName("import"); + for (int i = 0; i < imports.getLength(); i++) { + Element anImport = (Element) imports.item(i); + String plugin = anImport.getAttribute("plugin"); + if (plugin != null) { + pDescriptor.addDependency(plugin); + } + } + } + } + + /** + * @param pRootElement + * @param pDescriptor + * @throws MalformedURLException + */ + private void parseLibraries(Element pRootElement, PluginDescriptor pDescriptor) + throws MalformedURLException { + NodeList nodelist = pRootElement.getElementsByTagName("runtime"); + if (nodelist.getLength() > 0) { + + Element runtime = (Element) nodelist.item(0); + + NodeList libraries = runtime.getElementsByTagName("library"); + for (int i = 0; i < libraries.getLength(); i++)... [truncated message content] |
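The common thread in the diffs above is the Lucene API migration: the deprecated String-path forms of IndexWriter and IndexReader.open are replaced with explicit Directory instances (NIOFSDirectory), the writer gains an IndexWriter.MaxFieldLength argument, the merge constants move to LogMergePolicy, and Field.Index.TOKENIZED/UN_TOKENIZED become ANALYZED/NOT_ANALYZED. A minimal before/after sketch of that pattern, assuming a Lucene 2.9/3.0 jar on the classpath (the class name here is illustrative, not part of the commit):

    import java.io.File;

    import org.apache.lucene.analysis.WhitespaceAnalyzer;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.store.NIOFSDirectory;

    public class DirectoryMigrationSketch
    {
      public static void main( String[] args ) throws Exception
      {
        // Old, pre-2.9 style (removed upstream):
        //   IndexWriter writer = new IndexWriter( args[0], new WhitespaceAnalyzer( ), true );
        //   IndexReader reader = IndexReader.open( args[0] );

        // New style: an explicit Directory plus a MaxFieldLength policy.
        NIOFSDirectory dir = new NIOFSDirectory( new File( args[0] ) );

        IndexWriter writer = new IndexWriter( dir, new WhitespaceAnalyzer( ), IndexWriter.MaxFieldLength.UNLIMITED );
        writer.close( );

        // Read-only open, matching the DateAdder change above.
        IndexReader reader = IndexReader.open( dir, true );
        reader.close( );
      }
    }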
From: <bi...@us...> - 2010-07-16 20:25:47
|
Revision: 3170 http://archive-access.svn.sourceforge.net/archive-access/?rev=3170&view=rev Author: binzino Date: 2010-07-16 20:25:38 +0000 (Fri, 16 Jul 2010) Log Message: ----------- Changed logging levels to be less chatty. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/nutchwax/urlfilter/WaybackURLFilter.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2010-07-12 02:26:34 UTC (rev 3169) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/java/org/archive/nutchwax/Importer.java 2010-07-16 20:25:38 UTC (rev 3170) @@ -193,13 +193,14 @@ if ( LOG.isInfoEnabled() ) LOG.info( "Importing ARC: " + arcUrl ); - ArchiveReader r = ArchiveReaderFactory.get( arcUrl ); - r.setDigest( true ); - - ArcReader reader = new ArcReader( r ); - + ArchiveReader r = null; try { + r = ArchiveReaderFactory.get( arcUrl ); + r.setDigest( true ); + + ArcReader reader = new ArcReader( r ); + for ( ARCRecord record : reader ) { // When reading WARC files, records of type other than @@ -214,7 +215,7 @@ } catch ( Exception e ) { - LOG.warn( "Error processing archive file: " + arcUrl, e ); + LOG.error( "Error processing archive file: " + arcUrl, e ); if ( jobConf.getBoolean( "nutchwax.import.abortOnArchiveReadError", false ) ) { @@ -223,7 +224,7 @@ } finally { - r.close(); + if ( r != null ) r.close(); if ( LOG.isInfoEnabled() ) { @@ -246,11 +247,11 @@ { ARCRecordMetaData meta = record.getMetaData(); - if ( LOG.isInfoEnabled() ) LOG.info( "Consider URL: " + meta.getUrl() + " (" + meta.getMimetype() + ") [" + meta.getLength( ) + "]" ); + if ( LOG.isDebugEnabled() ) LOG.debug( "Consider URL: " + meta.getUrl() + " (" + meta.getMimetype() + ") [" + meta.getLength( ) + "]" ); if ( ! this.httpStatusCodeFilter.isAllowed( record.getStatusCode( ) ) ) { - if ( LOG.isInfoEnabled() ) LOG.info( "Skip URL: " + meta.getUrl() + " HTTP status:" + record.getStatusCode() ); + if ( LOG.isDebugEnabled() ) LOG.debug( "Skip URL: " + meta.getUrl() + " HTTP status:" + record.getStatusCode() ); return false; } @@ -291,7 +292,7 @@ if ( url == null ) { - if ( LOG.isInfoEnabled() ) LOG.info( "Skip URL: " + meta.getUrl() ); + if ( LOG.isDebugEnabled() ) LOG.debug( "Skip URL: " + meta.getUrl() ); return false; } @@ -375,11 +376,11 @@ } catch ( MalformedURLException mue ) { - if ( LOG.isInfoEnabled() ) LOG.info( "MalformedURL: " + candidateUrl ); + if ( LOG.isDebugEnabled() ) LOG.debug( "MalformedURL: " + candidateUrl ); } catch ( URLFilterException ufe ) { - if ( LOG.isInfoEnabled() ) LOG.info( "URL filtered: " + candidateUrl ); + if ( LOG.isDebugEnabled() ) LOG.debug( "URL filtered: " + candidateUrl ); } return null; @@ -439,9 +440,9 @@ { parseResult = this.parseUtil.parse( content ); } - catch ( Exception e ) + catch ( Throwable t ) { - LOG.warn( "Error parsing: " + key, e ); + if ( LOG.isDebugEnabled() ) LOG.debug( "Error parsing: " + key, t ); } // ?: This is taken from Nutch Fetcher. 
I believe the signatures are used in the Fetcher @@ -590,7 +591,7 @@ count += record.read( buf, 0, Math.min( buf.length, record.available( ) ) ); } - if ( LOG.isInfoEnabled() ) LOG.info( "Bytes read: expected=" + contentLength + " bytes.length=" + bytes.length + " pos=" + pos + " count=" + count ); + if ( LOG.isDebugEnabled() ) LOG.debug( "Bytes read: expected=" + contentLength + " bytes.length=" + bytes.length + " pos=" + pos + " count=" + count ); // Sanity check. The number of bytes read into our bytes[] // buffer, plus the count of extra stuff read after it should Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/nutchwax/urlfilter/WaybackURLFilter.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/nutchwax/urlfilter/WaybackURLFilter.java 2010-07-12 02:26:34 UTC (rev 3169) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/urlfilter-nutchwax/src/java/org/archive/nutchwax/urlfilter/WaybackURLFilter.java 2010-07-16 20:25:38 UTC (rev 3170) @@ -70,7 +70,7 @@ if ( s.length != 3 ) { // Don't filter. - LOG.info( "Allowing : " + urlString ); + if ( LOG.isDebugEnabled() ) LOG.debug( "Allowing : " + urlString ); return urlString; } @@ -101,12 +101,12 @@ if ( exclude ) { - LOG.info( "Excluding: " + urlString ); + if ( LOG.isDebugEnabled() ) LOG.debug( "Excluding: " + urlString ); return null; } - LOG.info( "Allowing : " + urlString ); + if ( LOG.isDebugEnabled() ) LOG.debug( "Allowing : " + urlString ); return urlString; } |
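Every call site in this commit uses the same commons-logging guard: test the level first, so the per-record message string is never concatenated when that level is disabled. A minimal sketch of the idiom (GuardedLoggingSketch is a hypothetical class, assuming commons-logging on the classpath):

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

public class GuardedLoggingSketch {
  private static final Log LOG = LogFactory.getLog(GuardedLoggingSketch.class);

  public void consider(String url, int status) {
    // Without the guard, the concatenation below runs for every record
    // even when DEBUG output is discarded; with it, the call is nearly free.
    if (LOG.isDebugEnabled()) {
      LOG.debug("Skip URL: " + url + " HTTP status:" + status);
    }
  }
}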
From: <bi...@us...> - 2010-11-22 22:44:55
|
Revision: 3344 http://archive-access.svn.sourceforge.net/archive-access/?rev=3344&view=rev Author: binzino Date: 2010-11-22 22:44:48 +0000 (Mon, 22 Nov 2010) Log Message: ----------- Add PDF parser that uses external 'pdftotext' tool. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml Added Paths: ----------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/build.xml tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/plugin.xml tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml 2010-11-19 02:51:57 UTC (rev 3343) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/nutch-site.xml 2010-11-22 22:44:48 UTC (rev 3344) @@ -10,7 +10,7 @@ <!-- Add 'index-nutchwax' and 'query-nutchwax' to plugin list. 
--> <!-- Also, add 'parse-pdf' --> <!-- Remove 'urlfilter-regex' and 'normalizer-(pass|regex|basic)' --> - <value>protocol-http|parse-(tika|text|html)|index-nutchwax|query-(basic|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value> + <value>protocol-http|parse-(pdf2|tika|text|html)|index-nutchwax|query-(basic|nutchwax)|summary-basic|scoring-nutchwax|urlfilter-nutchwax</value> </property> <!-- Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml 2010-11-19 02:51:57 UTC (rev 3343) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/conf/parse-plugins.xml 2010-11-22 22:44:48 UTC (rev 3344) @@ -31,7 +31,7 @@ </mimeType> <mimeType name="application/pdf"> - <plugin id="parse-tika" /> + <plugin id="parse-pdf2" /> </mimeType> <mimeType name="application/vnd.ms-excel"> @@ -152,6 +152,7 @@ <alias name="parse-ext" extension-id="ExtParser" /> <alias name="parse-text" extension-id="org.apache.nutch.parse.text.TextParser" /> <alias name="parse-html" extension-id="org.apache.nutch.parse.html.HtmlParser" /> + <alias name="parse-pdf2" extension-id="org.archive.nutchwax.parse.pdf.PDFParser" /> <!-- <alias name="parse-js" extension-id="JSParser" /> <alias name="parse-msexceld" extension-id="org.apache.nutch.parse.msexcel.MSExcelParser" /> Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml 2010-11-19 02:51:57 UTC (rev 3343) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml 2010-11-22 22:44:48 UTC (rev 3344) @@ -91,6 +91,7 @@ <ant dir="query-nutchwax" target="deploy" /> <ant dir="scoring-nutchwax" target="deploy" /> <ant dir="urlfilter-nutchwax" target="deploy" /> + <ant dir="parse-pdf2" target="deploy" /> </target> @@ -202,5 +203,6 @@ <ant dir="query-nutchwax" target="clean" /> <ant dir="scoring-nutchwax" target="clean" /> <ant dir="urlfilter-nutchwax" target="clean" /> + <ant dir="parse-pdf2" target="clean" /> </target> </project> Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/build.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/build.xml (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/build.xml 2010-11-22 22:44:48 UTC (rev 3344) @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="parse-pdf2" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/plugin.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/plugin.xml (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/plugin.xml 2010-11-22 22:44:48 UTC (rev 3344) @@ -0,0 +1,49 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Copyright (C) 2010 Internet Archive. + + This file is part of the archive-access tools project + (http://sourceforge.net/projects/archive-access). + + The archive-access tools are free software; you can redistribute them and/or + modify them under the terms of the GNU Lesser Public License as published by + the Free Software Foundation; either version 2.1 of the License, or any + later version. + + The archive-access tools are distributed in the hope that they will be + useful, but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser + Public License for more details. + + You should have received a copy of the GNU Lesser Public License along with + the archive-access tools; if not, write to the Free Software Foundation, + Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +--> +<plugin + id="parse-pdf2" + name="External PDF Parser" + version="1.0.0" + provider-name="archive.org"> + + <runtime> + <library name="parse-pdf2.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.archive.nutchwax.parse.pdf" + name="NutchWAX External PDF Parser" + point="org.apache.nutch.parse.Parser"> + + <implementation id="org.archive.nutchwax.parse.pdf.PDFParser" + class="org.archive.nutchwax.parse.pdf.PDFParser"> + <parameter name="contentType" value="application/pdf" /> + <parameter name="pathSuffix" value="" /> + </implementation> + </extension> + +</plugin> Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-pdf2/src/java/org/archive/nutchwax/parse/pdf/PDFParser.java 2010-11-22 22:44:48 UTC (rev 3344) @@ -0,0 +1,180 @@ +/* + * Copyright (C) 2010 Internet Archive. + * + * This file is part of the archive-access tools project + * (http://sourceforge.net/projects/archive-access). + * + * The archive-access tools are free software; you can redistribute them and/or + * modify them under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or any + * later version. + * + * The archive-access tools are distributed in the hope that they will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser + * Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License along with + * the archive-access tools; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.nutchwax.parse.pdf; + +import java.io.*; +import java.util.*; +import java.util.regex.*; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; + +import org.apache.nutch.protocol.Content; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.parse.ParseResult; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.parse.Parser; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.OutlinkExtractor; +import org.apache.nutch.util.LogUtil; + + +/** + * PDF parser that extracts the title and body text by shelling out to + * the external 'pdftotext' tool. + */ +public class PDFParser implements Parser +{ + public static final Log LOG = LogFactory.getLog( PDFParser.class ); + + private Configuration conf; + + public void setConf( Configuration conf ) + { + this.conf = conf; + } + + public Configuration getConf( ) + { + return this.conf; + } + + public ParseResult getParse( Content content ) + { + Metadata metadata = new Metadata(); + String title = ""; + String text = ""; + + byte[] raw = content.getContent( ); + + File tmpfile = null; + try + { + tmpfile = File.createTempFile( "pdf2-", ".pdf" ); + + // Write the PDF document to the tmp file. + FileOutputStream fos = new FileOutputStream( tmpfile ); + fos.write( raw ); + fos.close(); + + // Now create a Process to call 'pdftotext' to extract the metadata. + ProcessBuilder pb = new ProcessBuilder( "/usr/bin/pdftotext", "-htmlmeta", "-f", "1", "-l", "1", tmpfile.toString(), "-" ); + + Process p = pb.start(); + + p.getOutputStream( ).close(); + String head = suck( new InputStreamReader( p.getInputStream( ) ) ); + byte[] err = suck( p.getErrorStream( ) ); + + if ( err.length > 0 ) + { + LOG.warn( "Error from pdftotext: " + new String( err, "utf-8" ) ); + } + + p.destroy( ); + + pb = new ProcessBuilder( "/usr/bin/pdftotext", tmpfile.toString(), "-" ); + p = pb.start( ); + + p.getOutputStream( ).close( ); + text = suck( new InputStreamReader( p.getInputStream( ) ) ); + err = suck( p.getErrorStream( ) ); + + if ( err.length > 0 ) + { + LOG.warn( "Error from pdftotext: " + new String( err, "utf-8" ) ); + } + + p.destroy( ); + + Matcher m = Pattern.compile( "<html>.*?<title>(.*?)</title>.*?</head>", Pattern.DOTALL ).matcher( head ); + if ( m.find( ) ) + { + title = m.group(1); + } + + //System.out.println( "head = " + head ); + //System.out.println( "title = " + title ); + + // No outlinks. + Outlink[] outlinks = new Outlink[0]; + + ParseData parseData = new ParseData( ParseStatus.STATUS_SUCCESS, + title, + outlinks, + content.getMetadata(), + metadata ); + + return ParseResult.createParseResult( content.getUrl(), new ParseImpl( text, parseData ) ); + } + catch ( Exception e ) + { + LOG.error( e ); + } + finally + { + if ( tmpfile != null ) + { + tmpfile.delete(); + } + } + + // TODO!
+ return null; + } + + private byte[] suck( InputStream is ) + throws IOException + { + ByteArrayOutputStream baos = new ByteArrayOutputStream( 4* 1024 ); + byte[] buf = new byte[1024*4]; + int c = -1; + while ( (c = is.read( buf )) != -1 ) + { + baos.write( buf, 0, c ); + } + + return baos.toByteArray(); + } + + private String suck( InputStreamReader reader ) + throws IOException + { + StringBuilder sb = new StringBuilder( 1024 * 4 ); + char[] buf = new char[1024*4]; + int c = -1; + + while ( (c = reader.read( buf )) != -1 ) + { + sb.append( buf, 0, c ); + } + + return sb.toString(); + } + +} |
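The parser shells out to pdftotext twice per document: once with -htmlmeta restricted to page 1, so the title can be pulled out of the generated HTML head with a regex, and once plain over the whole file for the body text. A standalone sketch of those two invocations (PdftotextSketch is hypothetical; it assumes the same hard-coded /usr/bin/pdftotext path the plugin uses):

import java.io.BufferedReader;
import java.io.InputStreamReader;

public class PdftotextSketch {
  public static void main(String[] args) throws Exception {
    String pdf = args[0];
    // Pass 1: first page only, wrapped in HTML so the <title> is recoverable.
    run("/usr/bin/pdftotext", "-htmlmeta", "-f", "1", "-l", "1", pdf, "-");
    // Pass 2: plain text of the whole document, written to stdout ("-").
    run("/usr/bin/pdftotext", pdf, "-");
  }

  private static void run(String... cmd) throws Exception {
    Process p = new ProcessBuilder(cmd).redirectErrorStream(true).start();
    p.getOutputStream().close(); // pdftotext reads nothing from stdin
    try (BufferedReader r = new BufferedReader(
             new InputStreamReader(p.getInputStream(), "UTF-8"))) {
      String line;
      while ((line = r.readLine()) != null) {
        System.out.println(line);
      }
    }
    p.waitFor();
  }
}

Running the extractor in a child process also isolates the Hadoop task from pathological PDFs: a crash or runaway allocation kills only the pdftotext process, which is presumably part of the motivation for routing application/pdf away from parse-tika here.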
From: <bi...@us...> - 2012-01-26 20:53:09
|
Revision: 3607 http://archive-access.svn.sourceforge.net/archive-access/?rev=3607&view=rev Author: binzino Date: 2012-01-26 20:53:00 +0000 (Thu, 26 Jan 2012) Log Message: ----------- Initial revision of NutchWAX custom version of parse-html plugin. Main diffs are not enforcing robots meta tag nor trying to process redirects. Modified Paths: -------------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml Added Paths: ----------- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/build.xml tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/tagsoup-1.2.jar tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/tagsoup.LICENSE.txt tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/plugin.xml tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMBuilder.java tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMContentUtils.java tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HTMLMetaProcessor.java tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HtmlParser.java tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/XMLCharacterRecognizer.java Modified: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml 2012-01-26 20:51:04 UTC (rev 3606) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/nutch/src/plugin/build.xml 2012-01-26 20:53:00 UTC (rev 3607) @@ -92,6 +92,7 @@ <ant dir="scoring-nutchwax" target="deploy" /> <ant dir="urlfilter-nutchwax" target="deploy" /> <ant dir="parse-pdf2" target="deploy" /> + <ant dir="parse-html2" target="deploy" /> <ant dir="html-decorator" target="deploy" /> </target> Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/build.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/build.xml (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/build.xml 2012-01-26 20:53:00 UTC (rev 3607) @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="parse-html2" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/tagsoup-1.2.jar =================================================================== (Binary files differ) Property changes on: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/tagsoup-1.2.jar ___________________________________________________________________ Added: svn:mime-type + application/octet-stream Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/tagsoup.LICENSE.txt =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/tagsoup.LICENSE.txt (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/lib/tagsoup.LICENSE.txt 2012-01-26 20:53:00 UTC (rev 3607) @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/plugin.xml =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/plugin.xml (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/plugin.xml 2012-01-26 20:53:00 UTC (rev 3607) @@ -0,0 +1,47 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="parse-html2" + name="NutchWAX Html Parse Plug-in" + version="1.0.0" + provider-name="archive.org"> + + <runtime> + <library name="parse-html2.jar"> + <export name="*"/> + </library> + <library name="tagsoup-1.2.jar"/> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.archive.nutchwax.parse.html" + name="NutchWAX HTML Parser" + point="org.apache.nutch.parse.Parser"> + + <implementation id="org.archive.nutchwax.parse.html.HtmlParser" + class="org.archive.nutchwax.parse.html.HtmlParser"> + <parameter name="contentType" value="text/html"/> + <parameter name="pathSuffix" value=""/> + </implementation> + + </extension> + +</plugin> Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMBuilder.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMBuilder.java (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMBuilder.java 2012-01-26 20:53:00 UTC (rev 3607) @@ -0,0 +1,740 @@ +/* + * XXX ab...@ap...: This class is copied verbatim from Xalan-J 2.6.0 + * XXX distribution, org.apache.xml.utils.DOMBuilder, in order to + * avoid dependency on Xalan. + */ + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * $Id: DOMBuilder.java 823614 2009-10-09 17:02:32Z ab $ + */ +package org.archive.nutchwax.parse.html; + +import java.util.Stack; + +import org.w3c.dom.Comment; +import org.w3c.dom.Document; +import org.w3c.dom.DocumentFragment; +import org.w3c.dom.Element; +import org.w3c.dom.Node; +import org.w3c.dom.Text; +import org.w3c.dom.CDATASection; + +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.Locator; +import org.xml.sax.ext.LexicalHandler; +/** + * This class takes SAX events (in addition to some extra events + * that SAX doesn't handle yet) and adds the result to a document + * or document fragment. 
+ */ +public class DOMBuilder + implements ContentHandler, LexicalHandler +{ + + /** Root document */ + public Document m_doc; + + /** Current node */ + protected Node m_currentNode = null; + + /** First node of document fragment or null if not a DocumentFragment */ + public DocumentFragment m_docFrag = null; + + /** Vector of element nodes */ + protected Stack m_elemStack = new Stack(); + + /** + * DOMBuilder instance constructor... it will add the DOM nodes + * to the document fragment. + * + * @param doc Root document + * @param node Current node + */ + public DOMBuilder(Document doc, Node node) + { + m_doc = doc; + m_currentNode = node; + } + + /** + * DOMBuilder instance constructor... it will add the DOM nodes + * to the document fragment. + * + * @param doc Root document + * @param docFrag Document fragment + */ + public DOMBuilder(Document doc, DocumentFragment docFrag) + { + m_doc = doc; + m_docFrag = docFrag; + } + + /** + * DOMBuilder instance constructor... it will add the DOM nodes + * to the document. + * + * @param doc Root document + */ + public DOMBuilder(Document doc) + { + m_doc = doc; + } + + /** + * Get the root node of the DOM being created. This + * is either a Document or a DocumentFragment. + * + * @return The root document or document fragment if not null + */ + public Node getRootNode() + { + return (null != m_docFrag) ? (Node) m_docFrag : (Node) m_doc; + } + + /** + * Get the node currently being processed. + * + * @return the current node being processed + */ + public Node getCurrentNode() + { + return m_currentNode; + } + + /** + * Return null since there is no Writer for this class. + * + * @return null + */ + public java.io.Writer getWriter() + { + return null; + } + + /** + * Append a node to the current container. + * + * @param newNode New node to append + */ + protected void append(Node newNode) throws org.xml.sax.SAXException + { + + Node currentNode = m_currentNode; + + if (null != currentNode) + { + currentNode.appendChild(newNode); + + // System.out.println(newNode.getNodeName()); + } + else if (null != m_docFrag) + { + m_docFrag.appendChild(newNode); + } + else + { + boolean ok = true; + short type = newNode.getNodeType(); + + if (type == Node.TEXT_NODE) + { + String data = newNode.getNodeValue(); + + if ((null != data) && (data.trim().length() > 0)) + { + throw new org.xml.sax.SAXException("Warning: can't output text before document element! Ignoring..."); + } + + ok = false; + } + else if (type == Node.ELEMENT_NODE) + { + if (m_doc.getDocumentElement() != null) + { + throw new org.xml.sax.SAXException("Can't have more than one root on a DOM!"); + } + } + + if (ok) + m_doc.appendChild(newNode); + } + } + + /** + * Receive an object for locating the origin of SAX document events. + * + * <p>SAX parsers are strongly encouraged (though not absolutely + * required) to supply a locator: if it does so, it must supply + * the locator to the application by invoking this method before + * invoking any of the other methods in the ContentHandler + * interface.</p> + * + * <p>The locator allows the application to determine the end + * position of any document-related event, even if the parser is + * not reporting an error. Typically, the application will + * use this information for reporting its own errors (such as + * character content that does not match an application's + * business rules). 
The information returned by the locator + * is probably not sufficient for use with a search engine.</p> + * + * <p>Note that the locator will return correct information only + * during the invocation of the events in this interface. The + * application should not attempt to use it at any other time.</p> + * + * @param locator An object that can return the location of + * any SAX document event. + * @see org.xml.sax.Locator + */ + public void setDocumentLocator(Locator locator) + { + + // No action for the moment. + } + + /** + * Receive notification of the beginning of a document. + * + * <p>The SAX parser will invoke this method only once, before any + * other methods in this interface or in DTDHandler (except for + * setDocumentLocator).</p> + */ + public void startDocument() throws org.xml.sax.SAXException + { + + // No action for the moment. + } + + /** + * Receive notification of the end of a document. + * + * <p>The SAX parser will invoke this method only once, and it will + * be the last method invoked during the parse. The parser shall + * not invoke this method until it has either abandoned parsing + * (because of an unrecoverable error) or reached the end of + * input.</p> + */ + public void endDocument() throws org.xml.sax.SAXException + { + + // No action for the moment. + } + + /** + * Receive notification of the beginning of an element. + * + * <p>The Parser will invoke this method at the beginning of every + * element in the XML document; there will be a corresponding + * endElement() event for every startElement() event (even when the + * element is empty). All of the element's content will be + * reported, in order, before the corresponding endElement() + * event.</p> + * + * <p>If the element name has a namespace prefix, the prefix will + * still be attached. Note that the attribute list provided will + * contain only attributes with explicit values (specified or + * defaulted): #IMPLIED attributes will be omitted.</p> + * + * + * @param ns The namespace of the node + * @param localName The local part of the qualified name + * @param name The element name. + * @param atts The attributes attached to the element, if any. + * @see #endElement + * @see org.xml.sax.Attributes + */ + public void startElement( + String ns, String localName, String name, Attributes atts) + throws org.xml.sax.SAXException + { + + Element elem; + + // Note that the namespace-aware call must be used to correctly + // construct a Level 2 DOM, even for non-namespaced nodes. + if ((null == ns) || (ns.length() == 0)) + elem = m_doc.createElementNS(null,name); + else + elem = m_doc.createElementNS(ns, name); + + append(elem); + + try + { + int nAtts = atts.getLength(); + + if (0 != nAtts) + { + for (int i = 0; i < nAtts; i++) + { + + //System.out.println("type " + atts.getType(i) + " name " + atts.getLocalName(i) ); + // First handle a possible ID attribute + if (atts.getType(i).equalsIgnoreCase("ID")) + setIDAttribute(atts.getValue(i), elem); + + String attrNS = atts.getURI(i); + + if("".equals(attrNS)) + attrNS = null; // DOM represents no-namespace as null + + // System.out.println("attrNS: "+attrNS+", localName: "+atts.getQName(i) + // +", qname: "+atts.getQName(i)+", value: "+atts.getValue(i)); + // Crimson won't let us set an xmlns: attribute on the DOM. 
+ String attrQName = atts.getQName(i); + + // In SAX, xmlns: attributes have an empty namespace, while in DOM they should have the xmlns namespace + if (attrQName.startsWith("xmlns:")) + attrNS = "http://www.w3.org/2000/xmlns/"; + + // ALWAYS use the DOM Level 2 call! + elem.setAttributeNS(attrNS,attrQName, atts.getValue(i)); + } + } + + // append(elem); + + m_elemStack.push(elem); + + m_currentNode = elem; + + // append(elem); + } + catch(java.lang.Exception de) + { + // de.printStackTrace(); + throw new org.xml.sax.SAXException(de); + } + + } + + /** + + + + * Receive notification of the end of an element. + * + * <p>The SAX parser will invoke this method at the end of every + * element in the XML document; there will be a corresponding + * startElement() event for every endElement() event (even when the + * element is empty).</p> + * + * <p>If the element name has a namespace prefix, the prefix will + * still be attached to the name.</p> + * + * + * @param ns the namespace of the element + * @param localName The local part of the qualified name of the element + * @param name The element name + */ + public void endElement(String ns, String localName, String name) + throws org.xml.sax.SAXException + { + m_elemStack.pop(); + m_currentNode = m_elemStack.isEmpty() ? null : (Node)m_elemStack.peek(); + } + + /** + * Set an ID string to node association in the ID table. + * + * @param id The ID string. + * @param elem The associated ID. + */ + public void setIDAttribute(String id, Element elem) + { + + // Do nothing. This method is meant to be overiden. + } + + /** + * Receive notification of character data. + * + * <p>The Parser will call this method to report each chunk of + * character data. SAX parsers may return all contiguous character + * data in a single chunk, or they may split it into several + * chunks; however, all of the characters in any single event + * must come from the same external entity, so that the Locator + * provides useful information.</p> + * + * <p>The application must not attempt to read from the array + * outside of the specified range.</p> + * + * <p>Note that some parsers will report whitespace using the + * ignorableWhitespace() method rather than this one (validating + * parsers must do so).</p> + * + * @param ch The characters from the XML document. + * @param start The start position in the array. + * @param length The number of characters to read from the array. + * @see #ignorableWhitespace + * @see org.xml.sax.Locator + */ + public void characters(char ch[], int start, int length) throws org.xml.sax.SAXException + { + if(isOutsideDocElem() + && XMLCharacterRecognizer.isWhiteSpace(ch, start, length)) + return; // avoid DOM006 Hierarchy request error + + if (m_inCData) + { + cdata(ch, start, length); + + return; + } + + String s = new String(ch, start, length); + Node childNode; + childNode = m_currentNode != null ? m_currentNode.getLastChild(): null; + if( childNode != null && childNode.getNodeType() == Node.TEXT_NODE ){ + ((Text)childNode).appendData(s); + } + else{ + Text text = m_doc.createTextNode(s); + append(text); + } + } + + /** + * If available, when the disable-output-escaping attribute is used, + * output raw text without escaping. A PI will be inserted in front + * of the node with the name "lotusxsl-next-is-raw" and a value of + * "formatter-to-dom". 
+ * + * @param ch Array containing the characters + * @param start Index to start of characters in the array + * @param length Number of characters in the array + */ + public void charactersRaw(char ch[], int start, int length) + throws org.xml.sax.SAXException + { + if(isOutsideDocElem() + && XMLCharacterRecognizer.isWhiteSpace(ch, start, length)) + return; // avoid DOM006 Hierarchy request error + + + String s = new String(ch, start, length); + + append(m_doc.createProcessingInstruction("xslt-next-is-raw", + "formatter-to-dom")); + append(m_doc.createTextNode(s)); + } + + /** + * Report the beginning of an entity. + * + * The start and end of the document entity are not reported. + * The start and end of the external DTD subset are reported + * using the pseudo-name "[dtd]". All other events must be + * properly nested within start/end entity events. + * + * @param name The name of the entity. If it is a parameter + * entity, the name will begin with '%'. + * @see #endEntity + * @see org.xml.sax.ext.DeclHandler#internalEntityDecl + * @see org.xml.sax.ext.DeclHandler#externalEntityDecl + */ + public void startEntity(String name) throws org.xml.sax.SAXException + { + + // Almost certainly the wrong behavior... + // entityReference(name); + } + + /** + * Report the end of an entity. + * + * @param name The name of the entity that is ending. + * @see #startEntity + */ + public void endEntity(String name) throws org.xml.sax.SAXException{} + + /** + * Receive notivication of a entityReference. + * + * @param name name of the entity reference + */ + public void entityReference(String name) throws org.xml.sax.SAXException + { + append(m_doc.createEntityReference(name)); + } + + /** + * Receive notification of ignorable whitespace in element content. + * + * <p>Validating Parsers must use this method to report each chunk + * of ignorable whitespace (see the W3C XML 1.0 recommendation, + * section 2.10): non-validating parsers may also use this method + * if they are capable of parsing and using content models.</p> + * + * <p>SAX parsers may return all contiguous whitespace in a single + * chunk, or they may split it into several chunks; however, all of + * the characters in any single event must come from the same + * external entity, so that the Locator provides useful + * information.</p> + * + * <p>The application must not attempt to read from the array + * outside of the specified range.</p> + * + * @param ch The characters from the XML document. + * @param start The start position in the array. + * @param length The number of characters to read from the array. + * @see #characters + */ + public void ignorableWhitespace(char ch[], int start, int length) + throws org.xml.sax.SAXException + { + if(isOutsideDocElem()) + return; // avoid DOM006 Hierarchy request error + + String s = new String(ch, start, length); + + append(m_doc.createTextNode(s)); + } + + /** + * Tell if the current node is outside the document element. + * + * @return true if the current node is outside the document element. + */ + private boolean isOutsideDocElem() + { + return (null == m_docFrag) && m_elemStack.size() == 0 && (null == m_currentNode || m_currentNode.getNodeType() == Node.DOCUMENT_NODE); + } + + /** + * Receive notification of a processing instruction. 
+ * + * <p>The Parser will invoke this method once for each processing + * instruction found: note that processing instructions may occur + * before or after the main document element.</p> + * + * <p>A SAX parser should never report an XML declaration (XML 1.0, + * section 2.8) or a text declaration (XML 1.0, section 4.3.1) + * using this method.</p> + * + * @param target The processing instruction target. + * @param data The processing instruction data, or null if + * none was supplied. + */ + public void processingInstruction(String target, String data) + throws org.xml.sax.SAXException + { + append(m_doc.createProcessingInstruction(target, data)); + } + + /** + * Report an XML comment anywhere in the document. + * + * This callback will be used for comments inside or outside the + * document element, including comments in the external DTD + * subset (if read). + * + * @param ch An array holding the characters in the comment. + * @param start The starting position in the array. + * @param length The number of characters to use from the array. + */ + public void comment(char ch[], int start, int length) throws org.xml.sax.SAXException + { + // tagsoup sometimes submits invalid values here + if (ch == null || start < 0 || length >= (ch.length - start) || length < 0) return; + append(m_doc.createComment(new String(ch, start, length))); + } + + /** Flag indicating that we are processing a CData section */ + protected boolean m_inCData = false; + + /** + * Report the start of a CDATA section. + * + * @see #endCDATA + */ + public void startCDATA() throws org.xml.sax.SAXException + { + m_inCData = true; + append(m_doc.createCDATASection("")); + } + + /** + * Report the end of a CDATA section. + * + * @see #startCDATA + */ + public void endCDATA() throws org.xml.sax.SAXException + { + m_inCData = false; + } + + /** + * Receive notification of cdata. + * + * <p>The Parser will call this method to report each chunk of + * character data. SAX parsers may return all contiguous character + * data in a single chunk, or they may split it into several + * chunks; however, all of the characters in any single event + * must come from the same external entity, so that the Locator + * provides useful information.</p> + * + * <p>The application must not attempt to read from the array + * outside of the specified range.</p> + * + * <p>Note that some parsers will report whitespace using the + * ignorableWhitespace() method rather than this one (validating + * parsers must do so).</p> + * + * @param ch The characters from the XML document. + * @param start The start position in the array. + * @param length The number of characters to read from the array. + * @see #ignorableWhitespace + * @see org.xml.sax.Locator + */ + public void cdata(char ch[], int start, int length) throws org.xml.sax.SAXException + { + if(isOutsideDocElem() + && XMLCharacterRecognizer.isWhiteSpace(ch, start, length)) + return; // avoid DOM006 Hierarchy request error + + String s = new String(ch, start, length); + + // XXX ab...@ap...: modified from the original, to accomodate TagSoup. + Node n = m_currentNode.getLastChild(); + if (n instanceof CDATASection) + ((CDATASection)n).appendData(s); + else if (n instanceof Comment) + ((Comment)n).appendData(s); + } + + /** + * Report the start of DTD declarations, if any. + * + * Any declarations are assumed to be in the internal subset + * unless otherwise indicated. + * + * @param name The document type name. 
+ * @param publicId The declared public identifier for the + * external DTD subset, or null if none was declared. + * @param systemId The declared system identifier for the + * external DTD subset, or null if none was declared. + * @see #endDTD + * @see #startEntity + */ + public void startDTD(String name, String publicId, String systemId) + throws org.xml.sax.SAXException + { + + // Do nothing for now. + } + + /** + * Report the end of DTD declarations. + * + * @see #startDTD + */ + public void endDTD() throws org.xml.sax.SAXException + { + + // Do nothing for now. + } + + /** + * Begin the scope of a prefix-URI Namespace mapping. + * + * <p>The information from this event is not necessary for + * normal Namespace processing: the SAX XML reader will + * automatically replace prefixes for element and attribute + * names when the http://xml.org/sax/features/namespaces + * feature is true (the default).</p> + * + * <p>There are cases, however, when applications need to + * use prefixes in character data or in attribute values, + * where they cannot safely be expanded automatically; the + * start/endPrefixMapping event supplies the information + * to the application to expand prefixes in those contexts + * itself, if necessary.</p> + * + * <p>Note that start/endPrefixMapping events are not + * guaranteed to be properly nested relative to each-other: + * all startPrefixMapping events will occur before the + * corresponding startElement event, and all endPrefixMapping + * events will occur after the corresponding endElement event, + * but their order is not guaranteed.</p> + * + * @param prefix The Namespace prefix being declared. + * @param uri The Namespace URI the prefix is mapped to. + * @see #endPrefixMapping + * @see #startElement + */ + public void startPrefixMapping(String prefix, String uri) + throws org.xml.sax.SAXException + { + + /* + // Not sure if this is needed or wanted + // Also, it fails in the stree. + if((null != m_currentNode) + && (m_currentNode.getNodeType() == Node.ELEMENT_NODE)) + { + String qname; + if(((null != prefix) && (prefix.length() == 0)) + || (null == prefix)) + qname = "xmlns"; + else + qname = "xmlns:"+prefix; + + Element elem = (Element)m_currentNode; + String val = elem.getAttribute(qname); // Obsolete, should be DOM2...? + if(val == null) + { + elem.setAttributeNS("http://www.w3.org/XML/1998/namespace", + qname, uri); + } + } + */ + } + + /** + * End the scope of a prefix-URI mapping. + * + * <p>See startPrefixMapping for details. This event will + * always occur after the corresponding endElement event, + * but the order of endPrefixMapping events is not otherwise + * guaranteed.</p> + * + * @param prefix The prefix that was being mapping. + * @see #startPrefixMapping + * @see #endElement + */ + public void endPrefixMapping(String prefix) throws org.xml.sax.SAXException{} + + /** + * Receive notification of a skipped entity. + * + * <p>The Parser will invoke this method once for each entity + * skipped. Non-validating processors may skip entities if they + * have not seen the declarations (because, for example, the + * entity was declared in an external DTD subset). All processors + * may skip external entities, depending on the values of the + * http://xml.org/sax/features/external-general-entities and the + * http://xml.org/sax/features/external-parameter-entities + * properties.</p> + * + * @param name The name of the skipped entity. If it is a + * parameter entity, the name will begin with '%'. 
+ */ + public void skippedEntity(String name) throws org.xml.sax.SAXException{} +} Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMContentUtils.java =================================================================== --- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMContentUtils.java (rev 0) +++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/DOMContentUtils.java 2012-01-26 20:53:00 UTC (rev 3607) @@ -0,0 +1,419 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.archive.nutchwax.parse.html; + +import java.net.URL; +import java.net.MalformedURLException; +import java.util.Collection; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Stack; + +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.util.NodeWalker; +import org.apache.hadoop.conf.Configuration; + +import org.w3c.dom.*; + +/** + * A collection of methods for extracting content from DOM trees. + * + * This class holds a few utility methods for pulling content out of + * DOM nodes, such as getOutlinks, getText, etc. 
+ * + */ +public class DOMContentUtils { + + public static class LinkParams { + public String elName; + public String attrName; + public int childLen; + + public LinkParams(String elName, String attrName, int childLen) { + this.elName = elName; + this.attrName = attrName; + this.childLen = childLen; + } + + public String toString() { + return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]"; + } + } + + private HashMap linkParams = new HashMap(); + private Configuration conf; + + public DOMContentUtils(Configuration conf) { + setConf(conf); + } + + public void setConf(Configuration conf) { + // forceTags is used to override configurable tag ignoring, later on + Collection<String> forceTags = new ArrayList<String>(1); + + this.conf = conf; + linkParams.clear(); + linkParams.put("a", new LinkParams("a", "href", 1)); + linkParams.put("area", new LinkParams("area", "href", 0)); + if (conf.getBoolean("parser.html.form.use_action", true)) { + linkParams.put("form", new LinkParams("form", "action", 1)); + if (conf.get("parser.html.form.use_action") != null) + forceTags.add("form"); + } + linkParams.put("frame", new LinkParams("frame", "src", 0)); + linkParams.put("iframe", new LinkParams("iframe", "src", 0)); + linkParams.put("script", new LinkParams("script", "src", 0)); + linkParams.put("link", new LinkParams("link", "href", 0)); + linkParams.put("img", new LinkParams("img", "src", 0)); + + // remove unwanted link tags from the linkParams map + String[] ignoreTags = conf.getStrings("parser.html.outlinks.ignore_tags"); + for ( int i = 0 ; ignoreTags != null && i < ignoreTags.length ; i++ ) { + if ( ! forceTags.contains(ignoreTags[i]) ) + linkParams.remove(ignoreTags[i]); + } + } + + /** + * This method takes a {@link StringBuffer} and a DOM {@link Node}, + * and will append all the content text found beneath the DOM node to + * the <code>StringBuffer</code>. + * + * <p> + * + * If <code>abortOnNestedAnchors</code> is true, DOM traversal will + * be aborted and the <code>StringBuffer</code> will not contain + * any text encountered after a nested anchor is found. + * + * <p> + * + * @return true if nested anchors were found + */ + public boolean getText(StringBuffer sb, Node node, + boolean abortOnNestedAnchors) { + if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) { + return true; + } + return false; + } + + + /** + * This is a convenience method, equivalent to {@link + * #getText(StringBuffer,Node,boolean) getText(sb, node, false)}. 
+   *
+   */
+  public void getText(StringBuffer sb, Node node) {
+    getText(sb, node, false);
+  }
+
+  // returns true if abortOnNestedAnchors is true and we find nested
+  // anchors
+  private boolean getTextHelper(StringBuffer sb, Node node,
+                                boolean abortOnNestedAnchors,
+                                int anchorDepth) {
+    boolean abort = false;
+    NodeWalker walker = new NodeWalker(node);
+
+    while (walker.hasNext()) {
+
+      Node currentNode = walker.nextNode();
+      String nodeName = currentNode.getNodeName();
+      short nodeType = currentNode.getNodeType();
+
+      if ("script".equalsIgnoreCase(nodeName)) {
+        walker.skipChildren();
+      }
+      if ("style".equalsIgnoreCase(nodeName)) {
+        walker.skipChildren();
+      }
+      if (abortOnNestedAnchors && "a".equalsIgnoreCase(nodeName)) {
+        anchorDepth++;
+        if (anchorDepth > 1) {
+          abort = true;
+          break;
+        }
+      }
+      if (nodeType == Node.COMMENT_NODE) {
+        walker.skipChildren();
+      }
+      if (nodeType == Node.TEXT_NODE) {
+        // cleanup and trim the value
+        String text = currentNode.getNodeValue();
+        text = text.replaceAll("\\s+", " ");
+        text = text.trim();
+        if (text.length() > 0) {
+          if (sb.length() > 0) sb.append(' ');
+          sb.append(text);
+        }
+      }
+    }
+
+    return abort;
+  }
+
+  /**
+   * This method takes a {@link StringBuffer} and a DOM {@link Node},
+   * and will append the content text found beneath the first
+   * <code>title</code> node to the <code>StringBuffer</code>.
+   *
+   * @return true if a title node was found, false otherwise
+   */
+  public boolean getTitle(StringBuffer sb, Node node) {
+
+    NodeWalker walker = new NodeWalker(node);
+
+    while (walker.hasNext()) {
+
+      Node currentNode = walker.nextNode();
+      String nodeName = currentNode.getNodeName();
+      short nodeType = currentNode.getNodeType();
+
+      if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD
+        return false;
+      }
+
+      if (nodeType == Node.ELEMENT_NODE) {
+        if ("title".equalsIgnoreCase(nodeName)) {
+          getText(sb, currentNode);
+          return true;
+        }
+      }
+    }
+
+    return false;
+  }
+
+  /** If the Node contains a BASE tag then its HREF is returned. */
+  public URL getBase(Node node) {
+
+    NodeWalker walker = new NodeWalker(node);
+
+    while (walker.hasNext()) {
+
+      Node currentNode = walker.nextNode();
+      String nodeName = currentNode.getNodeName();
+      short nodeType = currentNode.getNodeType();
+
+      // is this node a BASE tag?
+      if (nodeType == Node.ELEMENT_NODE) {
+
+        if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD
+          return null;
+        }
+
+        if ("base".equalsIgnoreCase(nodeName)) {
+          NamedNodeMap attrs = currentNode.getAttributes();
+          for (int i= 0; i < attrs.getLength(); i++ ) {
+            Node attr = attrs.item(i);
+            if ("href".equalsIgnoreCase(attr.getNodeName())) {
+              try {
+                return new URL(attr.getNodeValue());
+              } catch (MalformedURLException e) {}
+            }
+          }
+        }
+      }
+    }
+
+    // no.
+    return null;
+  }
+
+
+  private boolean hasOnlyWhiteSpace(Node node) {
+    String val= node.getNodeValue();
+    for (int i= 0; i < val.length(); i++) {
+      if (!Character.isWhitespace(val.charAt(i)))
+        return false;
+    }
+    return true;
+  }
+
+  // this only covers a few cases of empty links that are symptomatic
+  // of nekohtml's DOM-fixup process...
+  private boolean shouldThrowAwayLink(Node node, NodeList children,
+                                      int childLen, LinkParams params) {
+    if (childLen == 0) {
+      // this has no inner structure
+      if (params.childLen == 0) return false;
+      else return true;
+    } else if ((childLen == 1)
+               && (children.item(0).getNodeType() == Node.ELEMENT_NODE)
+               && (params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) {
+      // single nested link
+      return true;
+
+    } else if (childLen == 2) {
+
+      Node c0= children.item(0);
+      Node c1= children.item(1);
+
+      if ((c0.getNodeType() == Node.ELEMENT_NODE)
+          && (params.elName.equalsIgnoreCase(c0.getNodeName()))
+          && (c1.getNodeType() == Node.TEXT_NODE)
+          && hasOnlyWhiteSpace(c1) ) {
+        // single link followed by whitespace node
+        return true;
+      }
+
+      if ((c1.getNodeType() == Node.ELEMENT_NODE)
+          && (params.elName.equalsIgnoreCase(c1.getNodeName()))
+          && (c0.getNodeType() == Node.TEXT_NODE)
+          && hasOnlyWhiteSpace(c0) ) {
+        // whitespace node followed by single link
+        return true;
+      }
+
+    } else if (childLen == 3) {
+      Node c0= children.item(0);
+      Node c1= children.item(1);
+      Node c2= children.item(2);
+
+      if ((c1.getNodeType() == Node.ELEMENT_NODE)
+          && (params.elName.equalsIgnoreCase(c1.getNodeName()))
+          && (c0.getNodeType() == Node.TEXT_NODE)
+          && (c2.getNodeType() == Node.TEXT_NODE)
+          && hasOnlyWhiteSpace(c0)
+          && hasOnlyWhiteSpace(c2) ) {
+        // single link surrounded by whitespace nodes
+        return true;
+      }
+    }
+
+    return false;
+  }
+
+  /**
+   * Handles cases where the URL param information is encoded into the base
+   * URL as opposed to the target.
+   * <p>
+   * If the target contains params (i.e. ';xxxx') information then the target
+   * params information is assumed to be correct and any base params information
+   * is ignored. If the base contains params information but the target does
+   * not, then the params information is moved to the target allowing it to be
+   * correctly determined by the java.net.URL class.
+   *
+   * @param base The base URL.
+   * @param target The target path from the base URL.
+   *
+   * @return URL A URL with the params information correctly encoded.
+   *
+   * @throws MalformedURLException If the url is not a well formed URL.
+   */
+  private URL fixEmbeddedParams(URL base, String target)
+    throws MalformedURLException{
+
+    // if the target contains params information, or the base doesn't, then no
+    // conversion is necessary; return a regular URL
+    if (target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) {
+      return new URL(base, target);
+    }
+
+    // get the base url and its params information
+    String baseURL = base.toString();
+    int startParams = baseURL.indexOf(';');
+    String params = baseURL.substring(startParams);
+
+    // if the target has a query string then put the params information after
+    // any path but before the query string, otherwise just append to the path
+    int startQS = target.indexOf('?');
+    if (startQS >= 0) {
+      target = target.substring(0, startQS) + params
+               + target.substring(startQS);
+    }
+    else {
+      target += params;
+    }
+
+    return new URL(base, target);
+  }
+
+  /**
+   * This method finds all anchors below the supplied DOM
+   * <code>node</code>, and creates appropriate {@link Outlink}
+   * records for each (relative to the supplied <code>base</code>
+   * URL), and adds them to the <code>outlinks</code> {@link
+   * ArrayList}.
+   *
+   * <p>
+   *
+   * Links without inner structure (tags, text, etc) are discarded, as
+   * are links which contain only single nested links and empty text
+   * nodes (this is a common DOM-fixup artifact, at least with
+   * nekohtml).
+   */
+  public void getOutlinks(URL base, ArrayList outlinks,
+                          Node node) {
+
+    NodeWalker walker = new NodeWalker(node);
+    while (walker.hasNext()) {
+
+      Node currentNode = walker.nextNode();
+      String nodeName = currentNode.getNodeName();
+      short nodeType = currentNode.getNodeType();
+      NodeList children = currentNode.getChildNodes();
+      int childLen = (children != null) ? children.getLength() : 0;
+
+      if (nodeType == Node.ELEMENT_NODE) {
+
+        nodeName = nodeName.toLowerCase();
+        LinkParams params = (LinkParams)linkParams.get(nodeName);
+        if (params != null) {
+          if (!shouldThrowAwayLink(currentNode, children, childLen, params)) {
+
+            StringBuffer linkText = new StringBuffer();
+            getText(linkText, currentNode, true);
+
+            NamedNodeMap attrs = currentNode.getAttributes();
+            String target = null;
+            boolean noFollow = false;
+            boolean post = false;
+            for (int i= 0; i < attrs.getLength(); i++ ) {
+              Node attr = attrs.item(i);
+              String attrName = attr.getNodeName();
+              if (params.attrName.equalsIgnoreCase(attrName)) {
+                target = attr.getNodeValue();
+              } else if ("rel".equalsIgnoreCase(attrName) &&
+                         "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
+                noFollow = true;
+              } else if ("method".equalsIgnoreCase(attrName) &&
+                         "post".equalsIgnoreCase(attr.getNodeValue())) {
+                post = true;
+              }
+            }
+            if (target != null && !noFollow && !post)
+              try {
+
+                URL url = (base.toString().indexOf(';') > 0) ?
+                          fixEmbeddedParams(base, target) : new URL(base, target);
+                outlinks.add(new Outlink(url.toString(),
+                                         linkText.toString().trim()));
+              } catch (MalformedURLException e) {
+                // don't care
+              }
+          }
+          // this should not have any children, skip them
+          if (params.childLen == 0) continue;
+        }
+      }
+    }
+  }
+
+}

Added: tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HTMLMetaProcessor.java
===================================================================
--- tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HTMLMetaProcessor.java	                        (rev 0)
+++ tags/nutchwax-0_13-JIRA-WAX-75/archive/src/plugin/parse-html2/src/java/org/archive/nutchwax/parse/html/HTMLMetaProcessor.java	2012-01-26 20:53:00 UTC (rev 3607)
@@ -0,0 +1,213 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.archive.nutchwax.parse.html;
+
+import java.net.URL;
+
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.w3c.dom.*;
+
+/**
+ * Class for parsing META Directives from DOM trees.  This class
+ * specifically handles Robots META directives (all, none, nofollow,
+ * noindex), finding BASE HREF tags, and HTTP-EQUIV no-cache
+ * instructions.  All meta directives are stored in an HTMLMetaTags instance.
+ */
+public class HTMLMetaProcessor {
+
+  /**
+   * Utility class with indicators for the robots directives "noindex"
+   * and "nofollow", and HTTP-EQUIV/no-cache
+   */
+
+  /**
+   * Sets the indicators in <code>robotsMeta</code> to appropriate
+   * values, based on any META tags found under the given
+   * <code>node</code>.
+   */
+  public static final void getMetaTags (
+    HTMLMetaTags metaTags, Node node, URL currURL) {
+
+    metaTags.reset();
+    getMetaTagsHelper(metaTags, node, currURL);
+  }
+
+  private static final void getMetaTagsHelper(
+    HTMLMetaTags metaTags, Node node, URL currURL) {
+
+    if (node.getNodeType() == Node.ELEMENT_NODE) {
+
+      if ("body".equalsIgnoreCase(node.getNodeName())) {
+        // META tags should not be under body
+        return;
+      }
+
+      if ("meta".equalsIgnoreCase(node.getNodeName())) {
+        NamedNodeMap attrs = node.getAttributes();
+        Node nameNode = null;
+        Node equivNode = null;
+        Node contentNode = null;
+        // Retrieves name, http-equiv and content attributes
+        for (int i=0; i<attrs.g... [truncated message content] |
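For readers unfamiliar with the parse-html2 plugin, the sketch below shows one way the DOMContentUtils API added above can be driven. It is a minimal, hypothetical harness, not part of the patch: the class name OutlinkDemo and the inline XHTML are invented for illustration, the stock JAXP parser stands in for the NekoHTML/TagSoup-repaired DOM the plugin actually supplies, and the Nutch, Hadoop, and NutchWAX jars are assumed to be on the classpath.

import java.io.ByteArrayInputStream;
import java.net.URL;
import java.util.ArrayList;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.parse.Outlink;
import org.archive.nutchwax.parse.html.DOMContentUtils;
import org.w3c.dom.Document;

public class OutlinkDemo {  // hypothetical harness, not part of the patch
  public static void main(String[] args) throws Exception {
    // Well-formed XHTML so the stock JAXP parser can build the DOM;
    // the plugin itself feeds DOMContentUtils a NekoHTML-repaired tree.
    String html = "<html><head><title>Demo</title></head>"
                + "<body><a href=\"/about\">About us</a></body></html>";
    DocumentBuilder builder =
        DocumentBuilderFactory.newInstance().newDocumentBuilder();
    Document doc =
        builder.parse(new ByteArrayInputStream(html.getBytes("UTF-8")));

    // An empty Hadoop Configuration leaves the default link tags (a, area,
    // form, frame, iframe, script, link, img) registered in setConf().
    DOMContentUtils utils = new DOMContentUtils(new Configuration());

    StringBuffer title = new StringBuffer();
    utils.getTitle(title, doc);               // title now holds "Demo"

    ArrayList outlinks = new ArrayList();     // raw type, matching the API above
    utils.getOutlinks(new URL("http://example.org/"), outlinks, doc);
    for (Object o : outlinks) {
      Outlink link = (Outlink) o;
      // prints: http://example.org/about | About us
      System.out.println(link.getToUrl() + " | " + link.getAnchor());
    }
  }
}

Note that if the base URL carried a param segment (e.g. ';jsessionid=...'), getOutlinks would route relative targets through fixEmbeddedParams so that segment is preserved on the resolved outlink URL.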