From: <bi...@us...> - 2008-06-26 22:30:16
Revision: 2327
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2327&view=rev
Author:   binzino
Date:     2008-06-26 15:30:24 -0700 (Thu, 26 Jun 2008)

Log Message:
-----------
Initial revision of modified version of Lucene's ParallelReader.

Added Paths:
-----------
    trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/
    trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/
    trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/index/
    trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/index/ArchiveParallelReader.java

Added: trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/index/ArchiveParallelReader.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/index/ArchiveParallelReader.java	(rev 0)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/index/ArchiveParallelReader.java	2008-06-26 22:30:24 UTC (rev 2327)
@@ -0,0 +1,614 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * ARCHIVE: This must be in the lucene index package because it needs
+ * to call protected methods on other IndexReader objects.
+ */
+package org.apache.lucene.index;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.FieldSelector;
+import org.apache.lucene.document.FieldSelectorResult;
+import org.apache.lucene.document.Fieldable;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.index.TermFreqVector;
+import org.apache.lucene.index.TermPositions;
+import org.apache.lucene.index.TermVectorMapper;
+
+import java.io.IOException;
+import java.util.*;
+
+
+/** An IndexReader which reads multiple, parallel indexes.  Each index added
+ * must have the same number of documents, but typically each contains
+ * different fields.  Each document contains the union of the fields of all
+ * documents with the same document number.  When searching, matches for a
+ * query term are from the first index added that has the field.
+ *
+ * <p>This is useful, e.g., with collections that have large fields which
+ * change rarely and small fields that change more frequently.  The smaller
+ * fields may be re-indexed in a new index and both indexes may be searched
+ * together.
+ *
+ * <p><strong>Warning:</strong> It is up to you to make sure all indexes
+ * are created and modified the same way. For example, if you add
+ * documents to one index, you need to add the same documents in the
+ * same order to the other indexes.
+ * <em>Failure to do so will result in
+ * undefined behavior</em>.
+ */
+public class ArchiveParallelReader extends IndexReader {
+  private List readers = new ArrayList();
+  private List decrefOnClose = new ArrayList(); // remember which subreaders to decRef on close
+  boolean incRefReaders = false;
+  private SortedMap fieldToReader = new TreeMap();
+
+  private int maxDoc;
+  private int numDocs;
+  private boolean hasDeletions;
+
+  /** Construct an ArchiveParallelReader.
+   * <p>Note that all subreaders are closed if this ArchiveParallelReader is closed.</p>
+   */
+  public ArchiveParallelReader() throws IOException { this(true); }
+
+  /** Construct an ArchiveParallelReader.
+   * @param closeSubReaders indicates whether the subreaders should be closed
+   * when this ArchiveParallelReader is closed
+   */
+  public ArchiveParallelReader(boolean closeSubReaders) throws IOException {
+    super();
+    this.incRefReaders = !closeSubReaders;
+  }
+
+  /** Add an IndexReader.
+   * @throws IOException if there is a low-level IO error
+   */
+  public void add(IndexReader reader) throws IOException
+  {
+    ensureOpen();
+    if (readers.size() == 0) {
+      this.maxDoc = reader.maxDoc();
+      this.numDocs = reader.numDocs();
+      this.hasDeletions = reader.hasDeletions();
+    }
+
+    if (reader.maxDoc() != maxDoc)                // check compatibility
+      throw new IllegalArgumentException
+        ("All readers must have same maxDoc: "+maxDoc+"!="+reader.maxDoc());
+    if (reader.numDocs() != numDocs)
+      throw new IllegalArgumentException
+        ("All readers must have same numDocs: "+numDocs+"!="+reader.numDocs());
+
+    Collection fields = reader.getFieldNames(IndexReader.FieldOption.ALL);
+    Iterator i = fields.iterator();
+    while (i.hasNext()) {                         // update fieldToReader map
+      String field = (String)i.next();
+      if (fieldToReader.get(field) == null)
+        fieldToReader.put(field, reader);
+    }
+
+    readers.add(reader);
+
+    if (incRefReaders) {
+      reader.incRef();
+    }
+    decrefOnClose.add(Boolean.valueOf(incRefReaders));
+  }
+
+  /**
+   * Tries to reopen the subreaders.
+   * <br>
+   * If one or more subreaders could be re-opened (i.e. subReader.reopen()
+   * returned a new instance != subReader), then a new ArchiveParallelReader instance
+   * is returned, otherwise this instance is returned.
+   * <p>
+   * A re-opened instance might share one or more subreaders with the old
+   * instance. Index modification operations result in undefined behavior
+   * when performed before the old instance is closed.
+   * (see {@link IndexReader#reopen()}).
+   * <p>
+   * If subreaders are shared, then the reference count of those
+   * readers is increased to ensure that the subreaders remain open
+   * until the last referring reader is closed.
+   *
+   * @throws CorruptIndexException if the index is corrupt
+   * @throws IOException if there is a low-level IO error
+   */
+  public IndexReader reopen() throws CorruptIndexException, IOException {
+    ensureOpen();
+
+    boolean reopened = false;
+    List newReaders = new ArrayList();
+    List newDecrefOnClose = new ArrayList();
+
+    boolean success = false;
+
+    try {
+
+      for (int i = 0; i < readers.size(); i++) {
+        IndexReader oldReader = (IndexReader) readers.get(i);
+        IndexReader newReader = oldReader.reopen();
+        newReaders.add(newReader);
+        // if at least one of the subreaders was updated we remember that
+        // and return a new ArchiveParallelReader
+        if (newReader != oldReader) {
+          reopened = true;
+        }
+      }
+
+      if (reopened) {
+        ArchiveParallelReader pr = new ArchiveParallelReader();
+        for (int i = 0; i < readers.size(); i++) {
+          IndexReader oldReader = (IndexReader) readers.get(i);
+          IndexReader newReader = (IndexReader) newReaders.get(i);
+          if (newReader == oldReader) {
+            newDecrefOnClose.add(Boolean.TRUE);
+            newReader.incRef();
+          } else {
+            // this is a new subreader instance, so on close() we don't
+            // decRef but close it
+            newDecrefOnClose.add(Boolean.FALSE);
+          }
+          pr.add(newReader);
+        }
+        pr.decrefOnClose = newDecrefOnClose;
+        pr.incRefReaders = incRefReaders;
+        success = true;
+        return pr;
+      } else {
+        success = true;
+        // No subreader was refreshed
+        return this;
+      }
+    } finally {
+      if (!success && reopened) {
+        for (int i = 0; i < newReaders.size(); i++) {
+          IndexReader r = (IndexReader) newReaders.get(i);
+          if (r != null) {
+            try {
+              if (((Boolean) newDecrefOnClose.get(i)).booleanValue()) {
+                r.decRef();
+              } else {
+                r.close();
+              }
+            } catch (IOException ignore) {
+              // keep going - we want to clean up as much as possible
+            }
+          }
+        }
+      }
+    }
+  }
+
+
+  public int numDocs() {
+    // Don't call ensureOpen() here (it could affect performance)
+    return numDocs;
+  }
+
+  public int maxDoc() {
+    // Don't call ensureOpen() here (it could affect performance)
+    return maxDoc;
+  }
+
+  public boolean hasDeletions() {
+    // Don't call ensureOpen() here (it could affect performance)
+    return hasDeletions;
+  }
+
+  // check first reader
+  public boolean isDeleted(int n) {
+    // Don't call ensureOpen() here (it could affect performance)
+    if (readers.size() > 0)
+      return ((IndexReader)readers.get(0)).isDeleted(n);
+    return false;
+  }
+
+  // delete in all readers
+  protected void doDelete(int n) throws CorruptIndexException, IOException {
+    for (int i = 0; i < readers.size(); i++) {
+      ((IndexReader)readers.get(i)).deleteDocument(n);
+    }
+    hasDeletions = true;
+  }
+
+  /**
+   * @see org.apache.lucene.index.ParallelReader#doUndeleteAll
+   */
+  protected void doUndeleteAll() throws CorruptIndexException, IOException {
+    for (int i = 0; i < readers.size(); i++) {
+      ((IndexReader)readers.get(i)).undeleteAll();
+    }
+    hasDeletions = false;
+  }
+
+  /**
+   * <p><strong>ARCHIVE</strong> modification</p>
+   * <p>Return a <code>Document</code> with fields merged from parallel
+   * indices.  The values for a given field will <strong>only</strong>
+   * come from the first index that has the field.
+   * This matches the searching behavior where a field is only
+   * searched in the first index that has the field.</p>
+   * <p>This differs from the bundled Lucene <code>ParallelReader</code>,
+   * which adds all values from every index that has the field.</p>
+   * <p>The <code>fieldSelector</code> parameter is ignored.</p>
+   * <h3>Implementation Notes</h3>
+   * <p>Since getting the document from the reader is the expensive
+   * operation, we only get it once from each reader.  Once we've
+   * gotten the document from the reader, we iterate through the
+   * fields and only copy those fields that are mapped to the reader.</p>
+   * <p>The first implementation iterated through the field names,
+   * getting the document from the corresponding reader for each
+   * field name (10 fields => 10 document gets) which was a big
+   * performance hit.</p>
+   * <p>In this implementation, there are only as many document gets as
+   * there are readers.</p>
+   * @param n ordinal position of document to return
+   * @param fieldSelector ignored
+   * @return the document with field values assembled from parallel indices
+   * @throws CorruptIndexException if the index is corrupt
+   * @throws IOException if there is a low-level IO error
+   */
+  public Document document(int n, FieldSelector fieldSelector)
+    throws CorruptIndexException, IOException
+  {
+    ensureOpen();
+    Document result = new Document();
+
+    for ( IndexReader reader : (List<IndexReader>) readers )
+      {
+        Document d = reader.document( n );
+
+        for ( Fieldable f : ((List<Fieldable>) d.getFields()) )
+          {
+            if ( fieldToReader.get( f.name( ) ) == reader )
+              {
+                result.add( f );
+              }
+          }
+      }
+
+    return result;
+  }
+
+  // get all vectors
+  public TermFreqVector[] getTermFreqVectors(int n) throws IOException {
+    ensureOpen();
+    ArrayList results = new ArrayList();
+    Iterator i = fieldToReader.entrySet().iterator();
+    while (i.hasNext()) {
+      Map.Entry e = (Map.Entry)i.next();
+      String field = (String)e.getKey();
+      IndexReader reader = (IndexReader)e.getValue();
+      TermFreqVector vector = reader.getTermFreqVector(n, field);
+      if (vector != null)
+        results.add(vector);
+    }
+    return (TermFreqVector[])
+      results.toArray(new TermFreqVector[results.size()]);
+  }
+
+  public TermFreqVector getTermFreqVector(int n, String field)
+    throws IOException {
+    ensureOpen();
+    IndexReader reader = ((IndexReader)fieldToReader.get(field));
+    return reader==null ? null : reader.getTermFreqVector(n, field);
+  }
+
+
+  public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException {
+    ensureOpen();
+    IndexReader reader = ((IndexReader)fieldToReader.get(field));
+    if (reader != null) {
+      reader.getTermFreqVector(docNumber, field, mapper);
+    }
+  }
+
+  public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException {
+    ensureOpen();
+
+    Iterator i = fieldToReader.entrySet().iterator();
+    while (i.hasNext()) {
+      Map.Entry e = (Map.Entry)i.next();
+      String field = (String)e.getKey();
+      IndexReader reader = (IndexReader)e.getValue();
+      reader.getTermFreqVector(docNumber, field, mapper);
+    }
+
+  }
+
+  public boolean hasNorms(String field) throws IOException {
+    ensureOpen();
+    IndexReader reader = ((IndexReader)fieldToReader.get(field));
+    return reader==null ? false : reader.hasNorms(field);
+  }
+
+  public byte[] norms(String field) throws IOException {
+    ensureOpen();
+    IndexReader reader = ((IndexReader)fieldToReader.get(field));
+    return reader==null ?
+      null : reader.norms(field);
+  }
+
+  public void norms(String field, byte[] result, int offset)
+    throws IOException {
+    ensureOpen();
+    IndexReader reader = ((IndexReader)fieldToReader.get(field));
+    if (reader!=null)
+      reader.norms(field, result, offset);
+  }
+
+  protected void doSetNorm(int n, String field, byte value)
+    throws CorruptIndexException, IOException {
+    IndexReader reader = ((IndexReader)fieldToReader.get(field));
+    if (reader!=null)
+      reader.doSetNorm(n, field, value);
+  }
+
+  public TermEnum terms() throws IOException {
+    ensureOpen();
+    return new ParallelTermEnum();
+  }
+
+  public TermEnum terms(Term term) throws IOException {
+    ensureOpen();
+    return new ParallelTermEnum(term);
+  }
+
+  public int docFreq(Term term) throws IOException {
+    ensureOpen();
+    IndexReader reader = ((IndexReader)fieldToReader.get(term.field()));
+    return reader==null ? 0 : reader.docFreq(term);
+  }
+
+  public TermDocs termDocs(Term term) throws IOException {
+    ensureOpen();
+    return new ParallelTermDocs(term);
+  }
+
+  public TermDocs termDocs() throws IOException {
+    ensureOpen();
+    return new ParallelTermDocs();
+  }
+
+  public TermPositions termPositions(Term term) throws IOException {
+    ensureOpen();
+    return new ParallelTermPositions(term);
+  }
+
+  public TermPositions termPositions() throws IOException {
+    ensureOpen();
+    return new ParallelTermPositions();
+  }
+
+  /**
+   * Checks recursively if all subreaders are up to date.
+   */
+  public boolean isCurrent() throws CorruptIndexException, IOException {
+    for (int i = 0; i < readers.size(); i++) {
+      if (!((IndexReader)readers.get(i)).isCurrent()) {
+        return false;
+      }
+    }
+
+    // all subreaders are up to date
+    return true;
+  }
+
+  /**
+   * Checks recursively if all subindexes are optimized.
+   */
+  public boolean isOptimized() {
+    for (int i = 0; i < readers.size(); i++) {
+      if (!((IndexReader)readers.get(i)).isOptimized()) {
+        return false;
+      }
+    }
+
+    // all subindexes are optimized
+    return true;
+  }
+
+
+  /** Not implemented.
+   * @throws UnsupportedOperationException
+   */
+  public long getVersion() {
+    throw new UnsupportedOperationException("ArchiveParallelReader does not support this method.");
+  }
+
+  // for testing
+  IndexReader[] getSubReaders() {
+    return (IndexReader[]) readers.toArray(new IndexReader[readers.size()]);
+  }
+
+  protected void doCommit() throws IOException {
+    for (int i = 0; i < readers.size(); i++)
+      ((IndexReader)readers.get(i)).commit();
+  }
+
+  protected synchronized void doClose() throws IOException {
+    for (int i = 0; i < readers.size(); i++) {
+      if (((Boolean) decrefOnClose.get(i)).booleanValue()) {
+        ((IndexReader)readers.get(i)).decRef();
+      } else {
+        ((IndexReader)readers.get(i)).close();
+      }
+    }
+  }
+
+  public Collection getFieldNames (IndexReader.FieldOption fieldNames) {
+    ensureOpen();
+    Set fieldSet = new HashSet();
+    for (int i = 0; i < readers.size(); i++) {
+      IndexReader reader = ((IndexReader)readers.get(i));
+      Collection names = reader.getFieldNames(fieldNames);
+      fieldSet.addAll(names);
+    }
+    return fieldSet;
+  }
+
+  private class ParallelTermEnum extends TermEnum {
+    private String field;
+    private Iterator fieldIterator;
+    private TermEnum termEnum;
+
+    public ParallelTermEnum() throws IOException {
+      field = (String)fieldToReader.firstKey();
+      if (field != null)
+        termEnum = ((IndexReader)fieldToReader.get(field)).terms();
+    }
+
+    public ParallelTermEnum(Term term) throws IOException {
+      field = term.field();
+      IndexReader reader = ((IndexReader)fieldToReader.get(field));
+      if (reader!=null)
+        termEnum = reader.terms(term);
+    }
+
+    public boolean next() throws IOException {
+      if (termEnum==null)
+        return false;
+
+      // another term in this field?
+      if (termEnum.next() && termEnum.term().field()==field)
+        return true;                              // yes, keep going
+
+      termEnum.close();                           // close old termEnum
+
+      // find the next field with terms, if any
+      if (fieldIterator==null) {
+        fieldIterator = fieldToReader.tailMap(field).keySet().iterator();
+        fieldIterator.next();                     // Skip field to get next one
+      }
+      while (fieldIterator.hasNext()) {
+        field = (String) fieldIterator.next();
+        termEnum = ((IndexReader)fieldToReader.get(field)).terms(new Term(field, ""));
+        Term term = termEnum.term();
+        if (term!=null && term.field()==field)
+          return true;
+        else
+          termEnum.close();
+      }
+
+      return false;                               // no more fields
+    }
+
+    public Term term() {
+      if (termEnum==null)
+        return null;
+
+      return termEnum.term();
+    }
+
+    public int docFreq() {
+      if (termEnum==null)
+        return 0;
+
+      return termEnum.docFreq();
+    }
+
+    public void close() throws IOException {
+      if (termEnum!=null)
+        termEnum.close();
+    }
+
+  }
+
+  // wrap a TermDocs in order to support seek(Term)
+  private class ParallelTermDocs implements TermDocs {
+    protected TermDocs termDocs;
+
+    public ParallelTermDocs() {}
+    public ParallelTermDocs(Term term) throws IOException { seek(term); }
+
+    public int doc() { return termDocs.doc(); }
+    public int freq() { return termDocs.freq(); }
+
+    public void seek(Term term) throws IOException {
+      IndexReader reader = ((IndexReader)fieldToReader.get(term.field()));
+      termDocs = reader!=null ?
+        reader.termDocs(term) : null;
+    }
+
+    public void seek(TermEnum termEnum) throws IOException {
+      seek(termEnum.term());
+    }
+
+    public boolean next() throws IOException {
+      if (termDocs==null)
+        return false;
+
+      return termDocs.next();
+    }
+
+    public int read(final int[] docs, final int[] freqs) throws IOException {
+      if (termDocs==null)
+        return 0;
+
+      return termDocs.read(docs, freqs);
+    }
+
+    public boolean skipTo(int target) throws IOException {
+      if (termDocs==null)
+        return false;
+
+      return termDocs.skipTo(target);
+    }
+
+    public void close() throws IOException {
+      if (termDocs!=null)
+        termDocs.close();
+    }
+
+  }
+
+  private class ParallelTermPositions
+    extends ParallelTermDocs implements TermPositions {
+
+    public ParallelTermPositions() {}
+    public ParallelTermPositions(Term term) throws IOException { seek(term); }
+
+    public void seek(Term term) throws IOException {
+      IndexReader reader = ((IndexReader)fieldToReader.get(term.field()));
+      termDocs = reader!=null ? reader.termPositions(term) : null;
+    }
+
+    public int nextPosition() throws IOException {
+      // It is an error to call this if there is no next position, e.g. if termDocs==null
+      return ((TermPositions)termDocs).nextPosition();
+    }
+
+    public int getPayloadLength() {
+      return ((TermPositions)termDocs).getPayloadLength();
+    }
+
+    public byte[] getPayload(byte[] data, int offset) throws IOException {
+      return ((TermPositions)termDocs).getPayload(data, offset);
+    }
+
+
+    // TODO: Remove warning after API has been finalized
+    public boolean isPayloadAvailable() {
+      return ((TermPositions) termDocs).isPayloadAvailable();
+    }
+  }
+
+}
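For context, a minimal usage sketch of the class added above, assuming two parallel
indexes with identical maxDoc()/numDocs(); the index paths and the choice of
IndexReader.open(String) (Lucene 2.x API) are illustrative, not part of this commit:

    // Hypothetical example: "index-main" holds the large, rarely changing fields;
    // "index-dates" holds small, frequently re-indexed fields.
    import org.apache.lucene.document.Document;
    import org.apache.lucene.index.ArchiveParallelReader;
    import org.apache.lucene.index.IndexReader;

    public class ArchiveParallelReaderExample {
      public static void main(String[] args) throws Exception {
        ArchiveParallelReader reader = new ArchiveParallelReader();
        reader.add(IndexReader.open("index-main"));   // fields found here win
        reader.add(IndexReader.open("index-dates"));  // only maps fields not seen yet
        Document doc = reader.document(0);            // merged fields, first index wins per field
        System.out.println(doc);
        reader.close();  // default constructor closes subreaders too
      }
    }

Note the difference from the stock ParallelReader: document() here returns each
field's values from the first added index that defines the field, matching search
behavior, rather than concatenating values from every index.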