From: <bi...@us...> - 2009-10-28 21:24:58
Revision: 2860
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2860&view=rev
Author:   binzino
Date:     2009-10-28 21:24:26 +0000 (Wed, 28 Oct 2009)

Log Message:
-----------
Moved to Nutch source overlay so that edits in Nutch sources can access this class.

Added Paths:
-----------
    trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/lucene/
    trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/lucene/index/
    trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/lucene/index/ArchiveParallelReader.java

Removed Paths:
-------------
    trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/index/ArchiveParallelReader.java

Deleted: trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/index/ArchiveParallelReader.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/index/ArchiveParallelReader.java	2009-10-28 03:40:14 UTC (rev 2859)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/index/ArchiveParallelReader.java	2009-10-28 21:24:26 UTC (rev 2860)
@@ -1,616 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/**
- * ARCHIVE: This must be in the lucene index package because it needs
- * to call protected methods on other IndexReader objects.
- */
-package org.apache.lucene.index;
-
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.FieldSelector;
-import org.apache.lucene.document.FieldSelectorResult;
-import org.apache.lucene.document.Fieldable;
-import org.apache.lucene.index.CorruptIndexException;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermDocs;
-import org.apache.lucene.index.TermEnum;
-import org.apache.lucene.index.TermFreqVector;
-import org.apache.lucene.index.TermPositions;
-import org.apache.lucene.index.TermVectorMapper;
-
-import java.io.IOException;
-import java.util.*;
-
-
-/** An IndexReader which reads multiple, parallel indexes.  Each index added
- * must have the same number of documents, but typically each contains
- * different fields.  Each document contains the union of the fields of all
- * documents with the same document number.  When searching, matches for a
- * query term are from the first index added that has the field.
- *
- * <p>This is useful, e.g., with collections that have large fields which
- * change rarely and small fields that change more frequently.  The smaller
- * fields may be re-indexed in a new index and both indexes may be searched
- * together.
- *
- * <p><strong>Warning:</strong> It is up to you to make sure all indexes
- * are created and modified the same way. For example, if you add
- * documents to one index, you need to add the same documents in the
- * same order to the other indexes. <em>Failure to do so will result in
- * undefined behavior</em>.
- */
-public class ArchiveParallelReader extends IndexReader {
-  private List readers = new ArrayList();
-  private List decrefOnClose = new ArrayList(); // remember which subreaders to decRef on close
-  boolean incRefReaders = false;
-  private SortedMap fieldToReader = new TreeMap();
-
-  private int maxDoc;
-  private int numDocs;
-  private boolean hasDeletions;
-
-  /** Construct an ArchiveParallelReader.
-   * <p>Note that all subreaders are closed if this ArchiveParallelReader is closed.</p>
-   */
-  public ArchiveParallelReader() throws IOException { this(true); }
-
-  /** Construct an ArchiveParallelReader.
-   * @param closeSubReaders indicates whether the subreaders should be closed
-   * when this ArchiveParallelReader is closed
-   */
-  public ArchiveParallelReader(boolean closeSubReaders) throws IOException {
-    super();
-    this.incRefReaders = !closeSubReaders;
-  }
-
-  /** Add an IndexReader.
-   * @throws IOException if there is a low-level IO error
-   */
-  public void add(IndexReader reader) throws IOException
-  {
-    ensureOpen();
-    if (readers.size() == 0) {
-      this.maxDoc = reader.maxDoc();
-      this.numDocs = reader.numDocs();
-      this.hasDeletions = reader.hasDeletions();
-    }
-
-    if (reader.maxDoc() != maxDoc)                // check compatibility
-      throw new IllegalArgumentException
-        ("All readers must have same maxDoc: "+maxDoc+"!="+reader.maxDoc());
-    if (reader.numDocs() != numDocs)
-      throw new IllegalArgumentException
-        ("All readers must have same numDocs: "+numDocs+"!="+reader.numDocs());
-
-    Collection fields = reader.getFieldNames(IndexReader.FieldOption.ALL);
-    Iterator i = fields.iterator();
-    while (i.hasNext()) {                         // update fieldToReader map
-      String field = (String)i.next();
-      if (fieldToReader.get(field) == null)
-        fieldToReader.put(field, reader);
-    }
-
-    readers.add(reader);
-
-    if (incRefReaders) {
-      reader.incRef();
-    }
-    decrefOnClose.add(Boolean.valueOf(incRefReaders));
-  }
-
-  /**
-   * Tries to reopen the subreaders.
-   * <br>
-   * If one or more subreaders could be re-opened (i.e. subReader.reopen()
-   * returned a new instance != subReader), then a new ArchiveParallelReader instance
-   * is returned, otherwise this instance is returned.
-   * <p>
-   * A re-opened instance might share one or more subreaders with the old
-   * instance. Index modification operations result in undefined behavior
-   * when performed before the old instance is closed.
-   * (see {@link IndexReader#reopen()}).
-   * <p>
-   * If subreaders are shared, then the reference count of those
-   * readers is increased to ensure that the subreaders remain open
-   * until the last referring reader is closed.
-   *
-   * @throws CorruptIndexException if the index is corrupt
-   * @throws IOException if there is a low-level IO error
-   */
-  public IndexReader reopen() throws CorruptIndexException, IOException {
-    ensureOpen();
-
-    boolean reopened = false;
-    List newReaders = new ArrayList();
-    List newDecrefOnClose = new ArrayList();
-
-    boolean success = false;
-
-    try {
-
-      for (int i = 0; i < readers.size(); i++) {
-        IndexReader oldReader = (IndexReader) readers.get(i);
-        IndexReader newReader = oldReader.reopen();
-        newReaders.add(newReader);
-        // if at least one of the subreaders was updated we remember that
-        // and return a new ArchiveParallelReader
-        if (newReader != oldReader) {
-          reopened = true;
-        }
-      }
-
-      if (reopened) {
-        ArchiveParallelReader pr = new ArchiveParallelReader();
-        for (int i = 0; i < readers.size(); i++) {
-          IndexReader oldReader = (IndexReader) readers.get(i);
-          IndexReader newReader = (IndexReader) newReaders.get(i);
-          if (newReader == oldReader) {
-            newDecrefOnClose.add(Boolean.TRUE);
-            newReader.incRef();
-          } else {
-            // this is a new subreader instance, so on close() we don't
-            // decRef but close it
-            newDecrefOnClose.add(Boolean.FALSE);
-          }
-          pr.add(newReader);
-        }
-        pr.decrefOnClose = newDecrefOnClose;
-        pr.incRefReaders = incRefReaders;
-        success = true;
-        return pr;
-      } else {
-        success = true;
-        // No subreader was refreshed
-        return this;
-      }
-    } finally {
-      if (!success && reopened) {
-        for (int i = 0; i < newReaders.size(); i++) {
-          IndexReader r = (IndexReader) newReaders.get(i);
-          if (r != null) {
-            try {
-              if (((Boolean) newDecrefOnClose.get(i)).booleanValue()) {
-                r.decRef();
-              } else {
-                r.close();
-              }
-            } catch (IOException ignore) {
-              // keep going - we want to clean up as much as possible
-            }
-          }
-        }
-      }
-    }
-  }
-
-
-  public int numDocs() {
-    // Don't call ensureOpen() here (it could affect performance)
-    return numDocs;
-  }
-
-  public int maxDoc() {
-    // Don't call ensureOpen() here (it could affect performance)
-    return maxDoc;
-  }
-
-  public boolean hasDeletions() {
-    // Don't call ensureOpen() here (it could affect performance)
-    return hasDeletions;
-  }
-
-  // check first reader
-  public boolean isDeleted(int n) {
-    // Don't call ensureOpen() here (it could affect performance)
-    if (readers.size() > 0)
-      return ((IndexReader)readers.get(0)).isDeleted(n);
-    return false;
-  }
-
-  // delete in all readers
-  protected void doDelete(int n) throws CorruptIndexException, IOException {
-    for (int i = 0; i < readers.size(); i++) {
-      ((IndexReader)readers.get(i)).deleteDocument(n);
-    }
-    hasDeletions = true;
-  }
-
-  /**
-   * @see org.apache.lucene.index.ParallelReader.doUndeleteAll
-   */
-  protected void doUndeleteAll() throws CorruptIndexException, IOException {
-    for (int i = 0; i < readers.size(); i++) {
-      ((IndexReader)readers.get(i)).undeleteAll();
-    }
-    hasDeletions = false;
-  }
-
-  /**
-   * <p><strong>ARCHIVE</strong> modification</p>
-   * <p>Return a <code>Document</code> with fields merged from parallel
-   * indices.  The values for a given field will <strong>only</strong>
-   * come from the first index that has the field.  This matches the
-   * searching behavior where a field is only searched in the first
-   * index that has the field.</p>
-   * <p>This differs from the bundled Lucene <code>ParallelReader</code>,
-   * which adds all values from every index that has the field.</p>
-   * <p>The <code>fieldSelector</code> parameter is ignored.</p>
-   * <h3>Implementation Notes</h3>
-   * <p>Since getting the document from the reader is the expensive
-   * operation, we only get it once from each reader.  Once we've
-   * gotten the document from the reader, we iterate through the
-   * fields and only copy those fields that are mapped to the reader.</p>
-   * <p>The first implementation iterated through the field names,
-   * getting the document from the corresponding reader for each
-   * field name (10 fields => 10 document gets) which was a big
-   * performance hit.</p>
-   * <p>In this implementation, there are only as many document gets as
-   * there are readers.</p>
-   * @param n ordinal position of document to return
-   * @param fieldSelector ignored
-   * @return the document with field values assembled from parallel indices
-   * @throws CorruptIndexException if the index is corrupt
-   * @throws IOException if there is a low-level IO error
-   */
-  public Document document(int n, FieldSelector fieldSelector)
-    throws CorruptIndexException, IOException
-  {
-    ensureOpen();
-    Document result = new Document();
-
-    for ( IndexReader reader : (List<IndexReader>) readers )
-    {
-      Document d = reader.document( n );
-
-      for ( Fieldable f : ((List<Fieldable>) d.getFields()) )
-      {
-        if ( fieldToReader.get( f.name( ) ) == reader )
-        {
-          result.add( f );
-        }
-      }
-    }
-
-    return result;
-  }
-
-  // get all vectors
-  public TermFreqVector[] getTermFreqVectors(int n) throws IOException {
-    ensureOpen();
-    ArrayList results = new ArrayList();
-    Iterator i = fieldToReader.entrySet().iterator();
-    while (i.hasNext()) {
-      Map.Entry e = (Map.Entry)i.next();
-      String field = (String)e.getKey();
-      IndexReader reader = (IndexReader)e.getValue();
-      TermFreqVector vector = reader.getTermFreqVector(n, field);
-      if (vector != null)
-        results.add(vector);
-    }
-    return (TermFreqVector[])
-      results.toArray(new TermFreqVector[results.size()]);
-  }
-
-  public TermFreqVector getTermFreqVector(int n, String field)
-    throws IOException {
-    ensureOpen();
-    IndexReader reader = ((IndexReader)fieldToReader.get(field));
-    return reader==null ? null : reader.getTermFreqVector(n, field);
-  }
-
-
-  public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException {
-    ensureOpen();
-    IndexReader reader = ((IndexReader)fieldToReader.get(field));
-    if (reader != null) {
-      reader.getTermFreqVector(docNumber, field, mapper);
-    }
-  }
-
-  public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException {
-    ensureOpen();
-
-    Iterator i = fieldToReader.entrySet().iterator();
-    while (i.hasNext()) {
-      Map.Entry e = (Map.Entry)i.next();
-      String field = (String)e.getKey();
-      IndexReader reader = (IndexReader)e.getValue();
-      reader.getTermFreqVector(docNumber, field, mapper);
-    }
-
-  }
-
-  public boolean hasNorms(String field) throws IOException {
-    ensureOpen();
-    IndexReader reader = ((IndexReader)fieldToReader.get(field));
-    return reader==null ? false : reader.hasNorms(field);
-  }
-
-  public byte[] norms(String field) throws IOException {
-    ensureOpen();
-    IndexReader reader = ((IndexReader)fieldToReader.get(field));
-    return reader==null ? null : reader.norms(field);
-  }
-
-  public void norms(String field, byte[] result, int offset)
-    throws IOException {
-    ensureOpen();
-    IndexReader reader = ((IndexReader)fieldToReader.get(field));
-    if (reader!=null)
-      reader.norms(field, result, offset);
-  }
-
-  protected void doSetNorm(int n, String field, byte value)
-    throws CorruptIndexException, IOException {
-    IndexReader reader = ((IndexReader)fieldToReader.get(field));
-    if (reader!=null)
-      reader.doSetNorm(n, field, value);
-  }
-
-  public TermEnum terms() throws IOException {
-    ensureOpen();
-    return new ParallelTermEnum();
-  }
-
-  public TermEnum terms(Term term) throws IOException {
-    ensureOpen();
-    return new ParallelTermEnum(term);
-  }
-
-  public int docFreq(Term term) throws IOException {
-    ensureOpen();
-    IndexReader reader = ((IndexReader)fieldToReader.get(term.field()));
-    return reader==null ? 0 : reader.docFreq(term);
-  }
-
-  public TermDocs termDocs(Term term) throws IOException {
-    ensureOpen();
-    return new ParallelTermDocs(term);
-  }
-
-  public TermDocs termDocs() throws IOException {
-    ensureOpen();
-    return new ParallelTermDocs();
-  }
-
-  public TermPositions termPositions(Term term) throws IOException {
-    ensureOpen();
-    return new ParallelTermPositions(term);
-  }
-
-  public TermPositions termPositions() throws IOException {
-    ensureOpen();
-    return new ParallelTermPositions();
-  }
-
-  /**
-   * Checks recursively if all subreaders are up to date.
-   */
-  public boolean isCurrent() throws CorruptIndexException, IOException {
-    for (int i = 0; i < readers.size(); i++) {
-      if (!((IndexReader)readers.get(i)).isCurrent()) {
-        return false;
-      }
-    }
-
-    // all subreaders are up to date
-    return true;
-  }
-
-  /**
-   * Checks recursively if all subindexes are optimized
-   */
-  public boolean isOptimized() {
-    for (int i = 0; i < readers.size(); i++) {
-      if (!((IndexReader)readers.get(i)).isOptimized()) {
-        return false;
-      }
-    }
-
-    // all subindexes are optimized
-    return true;
-  }
-
-
-  /** Not implemented.
-   * @throws UnsupportedOperationException
-   */
-  public long getVersion() {
-    throw new UnsupportedOperationException("ArchiveParallelReader does not support this method.");
-  }
-
-  // for testing
-  IndexReader[] getSubReaders() {
-    return (IndexReader[]) readers.toArray(new IndexReader[readers.size()]);
-  }
-
-  protected void doCommit() throws IOException {
-    for (int i = 0; i < readers.size(); i++)
-      ((IndexReader)readers.get(i)).commit();
-  }
-
-  protected synchronized void doClose() throws IOException {
-    for (int i = 0; i < readers.size(); i++) {
-      if (((Boolean) decrefOnClose.get(i)).booleanValue()) {
-        ((IndexReader)readers.get(i)).decRef();
-      } else {
-        ((IndexReader)readers.get(i)).close();
-      }
-    }
-  }
-
-  public Collection getFieldNames (IndexReader.FieldOption fieldNames) {
-    ensureOpen();
-    Set fieldSet = new HashSet();
-    for (int i = 0; i < readers.size(); i++) {
-      IndexReader reader = ((IndexReader)readers.get(i));
-      Collection names = reader.getFieldNames(fieldNames);
-      fieldSet.addAll(names);
-    }
-    return fieldSet;
-  }
-
-  private class ParallelTermEnum extends TermEnum {
-    private String field;
-    private Iterator fieldIterator;
-    private TermEnum termEnum;
-
-    public ParallelTermEnum() throws IOException {
-      if ( fieldToReader.isEmpty( ) ) return;
-
-      field = (String)fieldToReader.firstKey();
-      if (field != null)
-        termEnum = ((IndexReader)fieldToReader.get(field)).terms();
-    }
-
-    public ParallelTermEnum(Term term) throws IOException {
-      field = term.field();
-      IndexReader reader = ((IndexReader)fieldToReader.get(field));
-      if (reader!=null)
-        termEnum = reader.terms(term);
-    }
-
-    public boolean next() throws IOException {
-      if (termEnum==null)
-        return false;
-
-      // another term in this field?
-      if (termEnum.next() && termEnum.term().field()==field)
-        return true;                              // yes, keep going
-
-      termEnum.close();                           // close old termEnum
-
-      // find the next field with terms, if any
-      if (fieldIterator==null) {
-        fieldIterator = fieldToReader.tailMap(field).keySet().iterator();
-        fieldIterator.next();                     // Skip field to get next one
-      }
-      while (fieldIterator.hasNext()) {
-        field = (String) fieldIterator.next();
-        termEnum = ((IndexReader)fieldToReader.get(field)).terms(new Term(field, ""));
-        Term term = termEnum.term();
-        if (term!=null && term.field()==field)
-          return true;
-        else
-          termEnum.close();
-      }
-
-      return false;                               // no more fields
-    }
-
-    public Term term() {
-      if (termEnum==null)
-        return null;
-
-      return termEnum.term();
-    }
-
-    public int docFreq() {
-      if (termEnum==null)
-        return 0;
-
-      return termEnum.docFreq();
-    }
-
-    public void close() throws IOException {
-      if (termEnum!=null)
-        termEnum.close();
-    }
-
-  }
-
-  // wrap a TermDocs in order to support seek(Term)
-  private class ParallelTermDocs implements TermDocs {
-    protected TermDocs termDocs;
-
-    public ParallelTermDocs() {}
-    public ParallelTermDocs(Term term) throws IOException { seek(term); }
-
-    public int doc() { return termDocs.doc(); }
-    public int freq() { return termDocs.freq(); }
-
-    public void seek(Term term) throws IOException {
-      IndexReader reader = ((IndexReader)fieldToReader.get(term.field()));
-      termDocs = reader!=null ? reader.termDocs(term) : null;
-    }
-
-    public void seek(TermEnum termEnum) throws IOException {
-      seek(termEnum.term());
-    }
-
-    public boolean next() throws IOException {
-      if (termDocs==null)
-        return false;
-
-      return termDocs.next();
-    }
-
-    public int read(final int[] docs, final int[] freqs) throws IOException {
-      if (termDocs==null)
-        return 0;
-
-      return termDocs.read(docs, freqs);
-    }
-
-    public boolean skipTo(int target) throws IOException {
-      if (termDocs==null)
-        return false;
-
-      return termDocs.skipTo(target);
-    }
-
-    public void close() throws IOException {
-      if (termDocs!=null)
-        termDocs.close();
-    }
-
-  }
-
-  private class ParallelTermPositions
-    extends ParallelTermDocs implements TermPositions {
-
-    public ParallelTermPositions() {}
-    public ParallelTermPositions(Term term) throws IOException { seek(term); }
-
-    public void seek(Term term) throws IOException {
-      IndexReader reader = ((IndexReader)fieldToReader.get(term.field()));
-      termDocs = reader!=null ? reader.termPositions(term) : null;
-    }
-
-    public int nextPosition() throws IOException {
-      // It is an error to call this if there is no next position, e.g. if termDocs==null
-      return ((TermPositions)termDocs).nextPosition();
-    }
-
-    public int getPayloadLength() {
-      return ((TermPositions)termDocs).getPayloadLength();
-    }
-
-    public byte[] getPayload(byte[] data, int offset) throws IOException {
-      return ((TermPositions)termDocs).getPayload(data, offset);
-    }
-
-
-    // TODO: Remove warning after API has been finalized
-    public boolean isPayloadAvailable() {
-      return ((TermPositions) termDocs).isPayloadAvailable();
-    }
-  }
-
-}
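[Editor's note: the first-index-wins merge described in the document() javadoc above can be exercised as in the following minimal sketch. It is not part of this commit; the index paths and field names (/tmp/index-main, /tmp/index-dates, url, date) are hypothetical, and it assumes two parallel indexes built with identical document numbering, as the class javadoc requires.]

  import org.apache.lucene.document.Document;
  import org.apache.lucene.index.ArchiveParallelReader;
  import org.apache.lucene.index.IndexReader;

  public class ParallelReaderSketch {
    public static void main(String[] args) throws Exception {
      ArchiveParallelReader reader = new ArchiveParallelReader();

      // Order matters: each field is served by the FIRST added reader
      // that contains it, both for searching and for document().
      reader.add(IndexReader.open("/tmp/index-main"));   // hypothetical: url, content, date
      reader.add(IndexReader.open("/tmp/index-dates"));  // hypothetical: a re-indexed date

      // If both indexes contain a "date" field, only the value from
      // /tmp/index-main is returned -- unlike Lucene's bundled
      // ParallelReader, which would return the values from both.
      Document doc = reader.document(0);
      System.out.println(doc.get("url") + "\t" + doc.get("date"));

      reader.close();  // the default constructor closes the subreaders too
    }
  }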
Added: trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/lucene/index/ArchiveParallelReader.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/lucene/index/ArchiveParallelReader.java	                        (rev 0)
+++ trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/lucene/index/ArchiveParallelReader.java	2009-10-28 21:24:26 UTC (rev 2860)
@@ -0,0 +1,616 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * ARCHIVE: This must be in the lucene index package because it needs
+ * to call protected methods on other IndexReader objects.
+ */
+package org.apache.lucene.index;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.FieldSelector;
+import org.apache.lucene.document.FieldSelectorResult;
+import org.apache.lucene.document.Fieldable;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.index.TermFreqVector;
+import org.apache.lucene.index.TermPositions;
+import org.apache.lucene.index.TermVectorMapper;
+
+import java.io.IOException;
+import java.util.*;
+
+
+/** An IndexReader which reads multiple, parallel indexes.  Each index added
+ * must have the same number of documents, but typically each contains
+ * different fields.  Each document contains the union of the fields of all
+ * documents with the same document number.  When searching, matches for a
+ * query term are from the first index added that has the field.
+ *
+ * <p>This is useful, e.g., with collections that have large fields which
+ * change rarely and small fields that change more frequently.  The smaller
+ * fields may be re-indexed in a new index and both indexes may be searched
+ * together.
+ *
+ * <p><strong>Warning:</strong> It is up to you to make sure all indexes
+ * are created and modified the same way. For example, if you add
+ * documents to one index, you need to add the same documents in the
+ * same order to the other indexes. <em>Failure to do so will result in
+ * undefined behavior</em>.
+ */
+public class ArchiveParallelReader extends IndexReader {
+  private List readers = new ArrayList();
+  private List decrefOnClose = new ArrayList(); // remember which subreaders to decRef on close
+  boolean incRefReaders = false;
+  private SortedMap fieldToReader = new TreeMap();
+
+  private int maxDoc;
+  private int numDocs;
+  private boolean hasDeletions;
+
+  /** Construct an ArchiveParallelReader.
+   * <p>Note that all subreaders are closed if this ArchiveParallelReader is closed.</p>
+   */
+  public ArchiveParallelReader() throws IOException { this(true); }
+
+  /** Construct an ArchiveParallelReader.
+   * @param closeSubReaders indicates whether the subreaders should be closed
+   * when this ArchiveParallelReader is closed
+   */
+  public ArchiveParallelReader(boolean closeSubReaders) throws IOException {
+    super();
+    this.incRefReaders = !closeSubReaders;
+  }
+
+  /** Add an IndexReader.
+   * @throws IOException if there is a low-level IO error
+   */
+  public void add(IndexReader reader) throws IOException
+  {
+    ensureOpen();
+    if (readers.size() == 0) {
+      this.maxDoc = reader.maxDoc();
+      this.numDocs = reader.numDocs();
+      this.hasDeletions = reader.hasDeletions();
+    }
+
+    if (reader.maxDoc() != maxDoc)                // check compatibility
+      throw new IllegalArgumentException
+        ("All readers must have same maxDoc: "+maxDoc+"!="+reader.maxDoc());
+    if (reader.numDocs() != numDocs)
+      throw new IllegalArgumentException
+        ("All readers must have same numDocs: "+numDocs+"!="+reader.numDocs());
+
+    Collection fields = reader.getFieldNames(IndexReader.FieldOption.ALL);
+    Iterator i = fields.iterator();
+    while (i.hasNext()) {                         // update fieldToReader map
+      String field = (String)i.next();
+      if (fieldToReader.get(field) == null)
+        fieldToReader.put(field, reader);
+    }
+
+    readers.add(reader);
+
+    if (incRefReaders) {
+      reader.incRef();
+    }
+    decrefOnClose.add(Boolean.valueOf(incRefReaders));
+  }
+
+  /**
+   * Tries to reopen the subreaders.
+   * <br>
+   * If one or more subreaders could be re-opened (i.e. subReader.reopen()
+   * returned a new instance != subReader), then a new ArchiveParallelReader instance
+   * is returned, otherwise this instance is returned.
+   * <p>
+   * A re-opened instance might share one or more subreaders with the old
+   * instance. Index modification operations result in undefined behavior
+   * when performed before the old instance is closed.
+   * (see {@link IndexReader#reopen()}).
+   * <p>
+   * If subreaders are shared, then the reference count of those
+   * readers is increased to ensure that the subreaders remain open
+   * until the last referring reader is closed.
+   *
+   * @throws CorruptIndexException if the index is corrupt
+   * @throws IOException if there is a low-level IO error
+   */
+  public IndexReader reopen() throws CorruptIndexException, IOException {
+    ensureOpen();
+
+    boolean reopened = false;
+    List newReaders = new ArrayList();
+    List newDecrefOnClose = new ArrayList();
+
+    boolean success = false;
+
+    try {
+
+      for (int i = 0; i < readers.size(); i++) {
+        IndexReader oldReader = (IndexReader) readers.get(i);
+        IndexReader newReader = oldReader.reopen();
+        newReaders.add(newReader);
+        // if at least one of the subreaders was updated we remember that
+        // and return a new ArchiveParallelReader
+        if (newReader != oldReader) {
+          reopened = true;
+        }
+      }
+
+      if (reopened) {
+        ArchiveParallelReader pr = new ArchiveParallelReader();
+        for (int i = 0; i < readers.size(); i++) {
+          IndexReader oldReader = (IndexReader) readers.get(i);
+          IndexReader newReader = (IndexReader) newReaders.get(i);
+          if (newReader == oldReader) {
+            newDecrefOnClose.add(Boolean.TRUE);
+            newReader.incRef();
+          } else {
+            // this is a new subreader instance, so on close() we don't
+            // decRef but close it
+            newDecrefOnClose.add(Boolean.FALSE);
+          }
+          pr.add(newReader);
+        }
+        pr.decrefOnClose = newDecrefOnClose;
+        pr.incRefReaders = incRefReaders;
+        success = true;
+        return pr;
+      } else {
+        success = true;
+        // No subreader was refreshed
+        return this;
+      }
+    } finally {
+      if (!success && reopened) {
+        for (int i = 0; i < newReaders.size(); i++) {
+          IndexReader r = (IndexReader) newReaders.get(i);
+          if (r != null) {
+            try {
+              if (((Boolean) newDecrefOnClose.get(i)).booleanValue()) {
+                r.decRef();
+              } else {
+                r.close();
+              }
+            } catch (IOException ignore) {
+              // keep going - we want to clean up as much as possible
+            }
+          }
+        }
+      }
+    }
+  }
+
+
+  public int numDocs() {
+    // Don't call ensureOpen() here (it could affect performance)
+    return numDocs;
+  }
+
+  public int maxDoc() {
+    // Don't call ensureOpen() here (it could affect performance)
+    return maxDoc;
+  }
+
+  public boolean hasDeletions() {
+    // Don't call ensureOpen() here (it could affect performance)
+    return hasDeletions;
+  }
+
+  // check first reader
+  public boolean isDeleted(int n) {
+    // Don't call ensureOpen() here (it could affect performance)
+    if (readers.size() > 0)
+      return ((IndexReader)readers.get(0)).isDeleted(n);
+    return false;
+  }
+
+  // delete in all readers
+  protected void doDelete(int n) throws CorruptIndexException, IOException {
+    for (int i = 0; i < readers.size(); i++) {
+      ((IndexReader)readers.get(i)).deleteDocument(n);
+    }
+    hasDeletions = true;
+  }
+
+  /**
+   * @see org.apache.lucene.index.ParallelReader.doUndeleteAll
+   */
+  protected void doUndeleteAll() throws CorruptIndexException, IOException {
+    for (int i = 0; i < readers.size(); i++) {
+      ((IndexReader)readers.get(i)).undeleteAll();
+    }
+    hasDeletions = false;
+  }
+
+  /**
+   * <p><strong>ARCHIVE</strong> modification</p>
+   * <p>Return a <code>Document</code> with fields merged from parallel
+   * indices.  The values for a given field will <strong>only</strong>
+   * come from the first index that has the field.  This matches the
+   * searching behavior where a field is only searched in the first
+   * index that has the field.</p>
+   * <p>This differs from the bundled Lucene <code>ParallelReader</code>,
+   * which adds all values from every index that has the field.</p>
+   * <p>The <code>fieldSelector</code> parameter is ignored.</p>
+   * <h3>Implementation Notes</h3>
+   * <p>Since getting the document from the reader is the expensive
+   * operation, we only get it once from each reader.  Once we've
+   * gotten the document from the reader, we iterate through the
+   * fields and only copy those fields that are mapped to the reader.</p>
+   * <p>The first implementation iterated through the field names,
+   * getting the document from the corresponding reader for each
+   * field name (10 fields => 10 document gets) which was a big
+   * performance hit.</p>
+   * <p>In this implementation, there are only as many document gets as
+   * there are readers.</p>
+   * @param n ordinal position of document to return
+   * @param fieldSelector ignored
+   * @return the document with field values assembled from parallel indices
+   * @throws CorruptIndexException if the index is corrupt
+   * @throws IOException if there is a low-level IO error
+   */
+  public Document document(int n, FieldSelector fieldSelector)
+    throws CorruptIndexException, IOException
+  {
+    ensureOpen();
+    Document result = new Document();
+
+    for ( IndexReader reader : (List<IndexReader>) readers )
+    {
+      Document d = reader.document( n );
+
+      for ( Fieldable f : ((List<Fieldable>) d.getFields()) )
+      {
+        if ( fieldToReader.get( f.name( ) ) == reader )
+        {
+          result.add( f );
+        }
+      }
+    }
+
+    return result;
+  }
+
+  // get all vectors
+  public TermFreqVector[] getTermFreqVectors(int n) throws IOException {
+    ensureOpen();
+    ArrayList results = new ArrayList();
+    Iterator i = fieldToReader.entrySet().iterator();
+    while (i.hasNext()) {
+      Map.Entry e = (Map.Entry)i.next();
+      String field = (String)e.getKey();
+      IndexReader reader = (IndexReader)e.getValue();
+      TermFreqVector vector = reader.getTermFreqVector(n, field);
+      if (vector != null)
+        results.add(vector);
+    }
+    return (TermFreqVector[])
+      results.toArray(new TermFreqVector[results.size()]);
+  }
+
+  public TermFreqVector getTermFreqVector(int n, String field)
+    throws IOException {
+    ensureOpen();
+    IndexReader reader = ((IndexReader)fieldToReader.get(field));
+    return reader==null ? null : reader.getTermFreqVector(n, field);
+  }
+
+
+  public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException {
+    ensureOpen();
+    IndexReader reader = ((IndexReader)fieldToReader.get(field));
+    if (reader != null) {
+      reader.getTermFreqVector(docNumber, field, mapper);
+    }
+  }
+
+  public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException {
+    ensureOpen();
+
+    Iterator i = fieldToReader.entrySet().iterator();
+    while (i.hasNext()) {
+      Map.Entry e = (Map.Entry)i.next();
+      String field = (String)e.getKey();
+      IndexReader reader = (IndexReader)e.getValue();
+      reader.getTermFreqVector(docNumber, field, mapper);
+    }
+
+  }
+
+  public boolean hasNorms(String field) throws IOException {
+    ensureOpen();
+    IndexReader reader = ((IndexReader)fieldToReader.get(field));
+    return reader==null ? false : reader.hasNorms(field);
+  }
+
+  public byte[] norms(String field) throws IOException {
+    ensureOpen();
+    IndexReader reader = ((IndexReader)fieldToReader.get(field));
+    return reader==null ? null : reader.norms(field);
+  }
+
+  public void norms(String field, byte[] result, int offset)
+    throws IOException {
+    ensureOpen();
+    IndexReader reader = ((IndexReader)fieldToReader.get(field));
+    if (reader!=null)
+      reader.norms(field, result, offset);
+  }
+
+  protected void doSetNorm(int n, String field, byte value)
+    throws CorruptIndexException, IOException {
+    IndexReader reader = ((IndexReader)fieldToReader.get(field));
+    if (reader!=null)
+      reader.doSetNorm(n, field, value);
+  }
+
+  public TermEnum terms() throws IOException {
+    ensureOpen();
+    return new ParallelTermEnum();
+  }
+
+  public TermEnum terms(Term term) throws IOException {
+    ensureOpen();
+    return new ParallelTermEnum(term);
+  }
+
+  public int docFreq(Term term) throws IOException {
+    ensureOpen();
+    IndexReader reader = ((IndexReader)fieldToReader.get(term.field()));
+    return reader==null ? 0 : reader.docFreq(term);
+  }
+
+  public TermDocs termDocs(Term term) throws IOException {
+    ensureOpen();
+    return new ParallelTermDocs(term);
+  }
+
+  public TermDocs termDocs() throws IOException {
+    ensureOpen();
+    return new ParallelTermDocs();
+  }
+
+  public TermPositions termPositions(Term term) throws IOException {
+    ensureOpen();
+    return new ParallelTermPositions(term);
+  }
+
+  public TermPositions termPositions() throws IOException {
+    ensureOpen();
+    return new ParallelTermPositions();
+  }
+
+  /**
+   * Checks recursively if all subreaders are up to date.
+   */
+  public boolean isCurrent() throws CorruptIndexException, IOException {
+    for (int i = 0; i < readers.size(); i++) {
+      if (!((IndexReader)readers.get(i)).isCurrent()) {
+        return false;
+      }
+    }
+
+    // all subreaders are up to date
+    return true;
+  }
+
+  /**
+   * Checks recursively if all subindexes are optimized
+   */
+  public boolean isOptimized() {
+    for (int i = 0; i < readers.size(); i++) {
+      if (!((IndexReader)readers.get(i)).isOptimized()) {
+        return false;
+      }
+    }
+
+    // all subindexes are optimized
+    return true;
+  }
+
+
+  /** Not implemented.
+   * @throws UnsupportedOperationException
+   */
+  public long getVersion() {
+    throw new UnsupportedOperationException("ArchiveParallelReader does not support this method.");
+  }
+
+  // for testing
+  IndexReader[] getSubReaders() {
+    return (IndexReader[]) readers.toArray(new IndexReader[readers.size()]);
+  }
+
+  protected void doCommit() throws IOException {
+    for (int i = 0; i < readers.size(); i++)
+      ((IndexReader)readers.get(i)).commit();
+  }
+
+  protected synchronized void doClose() throws IOException {
+    for (int i = 0; i < readers.size(); i++) {
+      if (((Boolean) decrefOnClose.get(i)).booleanValue()) {
+        ((IndexReader)readers.get(i)).decRef();
+      } else {
+        ((IndexReader)readers.get(i)).close();
+      }
+    }
+  }
+
+  public Collection getFieldNames (IndexReader.FieldOption fieldNames) {
+    ensureOpen();
+    Set fieldSet = new HashSet();
+    for (int i = 0; i < readers.size(); i++) {
+      IndexReader reader = ((IndexReader)readers.get(i));
+      Collection names = reader.getFieldNames(fieldNames);
+      fieldSet.addAll(names);
+    }
+    return fieldSet;
+  }
+
+  private class ParallelTermEnum extends TermEnum {
+    private String field;
+    private Iterator fieldIterator;
+    private TermEnum termEnum;
+
+    public ParallelTermEnum() throws IOException {
+      if ( fieldToReader.isEmpty( ) ) return;
+
+      field = (String)fieldToReader.firstKey();
+      if (field != null)
+        termEnum = ((IndexReader)fieldToReader.get(field)).terms();
+    }
+
+    public ParallelTermEnum(Term term) throws IOException {
+      field = term.field();
+      IndexReader reader = ((IndexReader)fieldToReader.get(field));
+      if (reader!=null)
+        termEnum = reader.terms(term);
+    }
+
+    public boolean next() throws IOException {
+      if (termEnum==null)
+        return false;
+
+      // another term in this field?
+      if (termEnum.next() && termEnum.term().field()==field)
+        return true;                              // yes, keep going
+
+      termEnum.close();                           // close old termEnum
+
+      // find the next field with terms, if any
+      if (fieldIterator==null) {
+        fieldIterator = fieldToReader.tailMap(field).keySet().iterator();
+        fieldIterator.next();                     // Skip field to get next one
+      }
+      while (fieldIterator.hasNext()) {
+        field = (String) fieldIterator.next();
+        termEnum = ((IndexReader)fieldToReader.get(field)).terms(new Term(field, ""));
+        Term term = termEnum.term();
+        if (term!=null && term.field()==field)
+          return true;
+        else
+          termEnum.close();
+      }
+
+      return false;                               // no more fields
+    }
+
+    public Term term() {
+      if (termEnum==null)
+        return null;
+
+      return termEnum.term();
+    }
+
+    public int docFreq() {
+      if (termEnum==null)
+        return 0;
+
+      return termEnum.docFreq();
+    }
+
+    public void close() throws IOException {
+      if (termEnum!=null)
+        termEnum.close();
+    }
+
+  }
+
+  // wrap a TermDocs in order to support seek(Term)
+  private class ParallelTermDocs implements TermDocs {
+    protected TermDocs termDocs;
+
+    public ParallelTermDocs() {}
+    public ParallelTermDocs(Term term) throws IOException { seek(term); }
+
+    public int doc() { return termDocs.doc(); }
+    public int freq() { return termDocs.freq(); }
+
+    public void seek(Term term) throws IOException {
+      IndexReader reader = ((IndexReader)fieldToReader.get(term.field()));
+      termDocs = reader!=null ? reader.termDocs(term) : null;
+    }
+
+    public void seek(TermEnum termEnum) throws IOException {
+      seek(termEnum.term());
+    }
+
+    public boolean next() throws IOException {
+      if (termDocs==null)
+        return false;
+
+      return termDocs.next();
+    }
+
+    public int read(final int[] docs, final int[] freqs) throws IOException {
+      if (termDocs==null)
+        return 0;
+
+      return termDocs.read(docs, freqs);
+    }
+
+    public boolean skipTo(int target) throws IOException {
+      if (termDocs==null)
+        return false;
+
+      return termDocs.skipTo(target);
+    }
+
+    public void close() throws IOException {
+      if (termDocs!=null)
+        termDocs.close();
+    }
+
+  }
+
+  private class ParallelTermPositions
+    extends ParallelTermDocs implements TermPositions {
+
+    public ParallelTermPositions() {}
+    public ParallelTermPositions(Term term) throws IOException { seek(term); }
+
+    public void seek(Term term) throws IOException {
+      IndexReader reader = ((IndexReader)fieldToReader.get(term.field()));
+      termDocs = reader!=null ? reader.termPositions(term) : null;
+    }
+
+    public int nextPosition() throws IOException {
+      // It is an error to call this if there is no next position, e.g. if termDocs==null
+      return ((TermPositions)termDocs).nextPosition();
+    }
+
+    public int getPayloadLength() {
+      return ((TermPositions)termDocs).getPayloadLength();
+    }
+
+    public byte[] getPayload(byte[] data, int offset) throws IOException {
+      return ((TermPositions)termDocs).getPayload(data, offset);
+    }
+
+
+    // TODO: Remove warning after API has been finalized
+    public boolean isPayloadAvailable() {
+      return ((TermPositions) termDocs).isPayloadAvailable();
+    }
+  }
+
+}
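[Editor's note: the reference-counting contract that the reopen() javadoc describes can be used as in the fragment below. This is a sketch, not part of the commit; it assumes a reader built as in the earlier sketch, in a context where org.apache.lucene.index.IndexReader and java.io.IOException are imported.]

  // A new ArchiveParallelReader is returned only if at least one subreader
  // changed; shared subreaders are incRef'd by reopen(), so closing the old
  // instance never closes readers the new instance still uses.
  static IndexReader refresh(IndexReader current) throws IOException {
    IndexReader reopened = current.reopen();
    if (reopened != current) {
      current.close();     // safe: shared subreaders remain open
      return reopened;     // search the refreshed view from here on
    }
    return current;        // nothing changed; keep using the old instance
  }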