From: <bi...@us...> - 2008-06-26 22:30:16
Revision: 2327
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2327&view=rev
Author:   binzino
Date:     2008-06-26 15:30:24 -0700 (Thu, 26 Jun 2008)

Log Message:
-----------
Initial revision of modified version of Lucene's ParallelReader.

Added Paths:
-----------
    trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/
    trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/
    trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/index/
    trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/index/ArchiveParallelReader.java

Added: trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/index/ArchiveParallelReader.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/index/ArchiveParallelReader.java	(rev 0)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/apache/lucene/index/ArchiveParallelReader.java	2008-06-26 22:30:24 UTC (rev 2327)
@@ -0,0 +1,614 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * ARCHIVE: This must be in the lucene index package because it needs
+ * to call protected methods on other IndexReader objects.
+ */
+package org.apache.lucene.index;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.FieldSelector;
+import org.apache.lucene.document.FieldSelectorResult;
+import org.apache.lucene.document.Fieldable;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.index.TermFreqVector;
+import org.apache.lucene.index.TermPositions;
+import org.apache.lucene.index.TermVectorMapper;
+
+import java.io.IOException;
+import java.util.*;
+
+
+/** An IndexReader which reads multiple, parallel indexes.  Each index added
+ * must have the same number of documents, but typically each contains
+ * different fields.  Each document contains the union of the fields of all
+ * documents with the same document number.  When searching, matches for a
+ * query term are from the first index added that has the field.
+ *
+ * <p>This is useful, e.g., with collections that have large fields which
+ * change rarely and small fields that change more frequently.  The smaller
+ * fields may be re-indexed in a new index and both indexes may be searched
+ * together.
+ *
+ * <p><strong>Warning:</strong> It is up to you to make sure all indexes
+ * are created and modified the same way. For example, if you add
+ * documents to one index, you need to add the same documents in the
+ * same order to the other indexes.
+ * <em>Failure to do so will result in
+ * undefined behavior</em>.
+ */
+public class ArchiveParallelReader extends IndexReader {
+  private List readers = new ArrayList();
+  private List decrefOnClose = new ArrayList(); // remember which subreaders to decRef on close
+  boolean incRefReaders = false;
+  private SortedMap fieldToReader = new TreeMap();
+
+  private int maxDoc;
+  private int numDocs;
+  private boolean hasDeletions;
+
+  /** Construct an ArchiveParallelReader.
+   * <p>Note that all subreaders are closed if this ArchiveParallelReader is closed.</p>
+   */
+  public ArchiveParallelReader() throws IOException { this(true); }
+
+  /** Construct an ArchiveParallelReader.
+   * @param closeSubReaders indicates whether the subreaders should be closed
+   * when this ArchiveParallelReader is closed
+   */
+  public ArchiveParallelReader(boolean closeSubReaders) throws IOException {
+    super();
+    this.incRefReaders = !closeSubReaders;
+  }
+
+  /** Add an IndexReader.
+   * @throws IOException if there is a low-level IO error
+   */
+  public void add(IndexReader reader) throws IOException
+  {
+    ensureOpen();
+    if (readers.size() == 0) {
+      this.maxDoc = reader.maxDoc();
+      this.numDocs = reader.numDocs();
+      this.hasDeletions = reader.hasDeletions();
+    }
+
+    if (reader.maxDoc() != maxDoc)                // check compatibility
+      throw new IllegalArgumentException
+        ("All readers must have same maxDoc: "+maxDoc+"!="+reader.maxDoc());
+    if (reader.numDocs() != numDocs)
+      throw new IllegalArgumentException
+        ("All readers must have same numDocs: "+numDocs+"!="+reader.numDocs());
+
+    Collection fields = reader.getFieldNames(IndexReader.FieldOption.ALL);
+    Iterator i = fields.iterator();
+    while (i.hasNext()) {                         // update fieldToReader map
+      String field = (String)i.next();
+      if (fieldToReader.get(field) == null)
+        fieldToReader.put(field, reader);
+    }
+
+    readers.add(reader);
+
+    if (incRefReaders) {
+      reader.incRef();
+    }
+    decrefOnClose.add(Boolean.valueOf(incRefReaders));
+  }
+
+  /**
+   * Tries to reopen the subreaders.
+   * <br>
+   * If one or more subreaders could be re-opened (i.e. subReader.reopen()
+   * returned a new instance != subReader), then a new ArchiveParallelReader instance
+   * is returned, otherwise this instance is returned.
+   * <p>
+   * A re-opened instance might share one or more subreaders with the old
+   * instance. Index modification operations result in undefined behavior
+   * when performed before the old instance is closed.
+   * (see {@link IndexReader#reopen()}).
+   * <p>
+   * If subreaders are shared, then the reference count of those
+   * readers is increased to ensure that the subreaders remain open
+   * until the last referring reader is closed.
+   *
+   * @throws CorruptIndexException if the index is corrupt
+   * @throws IOException if there is a low-level IO error
+   */
+  public IndexReader reopen() throws CorruptIndexException, IOException {
+    ensureOpen();
+
+    boolean reopened = false;
+    List newReaders = new ArrayList();
+    List newDecrefOnClose = new ArrayList();
+
+    boolean success = false;
+
+    try {
+
+      for (int i = 0; i < readers.size(); i++) {
+        IndexReader oldReader = (IndexReader) readers.get(i);
+        IndexReader newReader = oldReader.reopen();
+        newReaders.add(newReader);
+        // if at least one of the subreaders was updated we remember that
+        // and return a new ArchiveParallelReader
+        if (newReader != oldReader) {
+          reopened = true;
+        }
+      }
+
+      if (reopened) {
+        ArchiveParallelReader pr = new ArchiveParallelReader();
+        for (int i = 0; i < readers.size(); i++) {
+          IndexReader oldReader = (IndexReader) readers.get(i);
+          IndexReader newReader = (IndexReader) newReaders.get(i);
+          if (newReader == oldReader) {
+            newDecrefOnClose.add(Boolean.TRUE);
+            newReader.incRef();
+          } else {
+            // this is a new subreader instance, so on close() we don't
+            // decRef but close it
+            newDecrefOnClose.add(Boolean.FALSE);
+          }
+          pr.add(newReader);
+        }
+        pr.decrefOnClose = newDecrefOnClose;
+        pr.incRefReaders = incRefReaders;
+        success = true;
+        return pr;
+      } else {
+        success = true;
+        // No subreader was refreshed
+        return this;
+      }
+    } finally {
+      if (!success && reopened) {
+        for (int i = 0; i < newReaders.size(); i++) {
+          IndexReader r = (IndexReader) newReaders.get(i);
+          if (r != null) {
+            try {
+              if (((Boolean) newDecrefOnClose.get(i)).booleanValue()) {
+                r.decRef();
+              } else {
+                r.close();
+              }
+            } catch (IOException ignore) {
+              // keep going - we want to clean up as much as possible
+            }
+          }
+        }
+      }
+    }
+  }
+
+
+  public int numDocs() {
+    // Don't call ensureOpen() here (it could affect performance)
+    return numDocs;
+  }
+
+  public int maxDoc() {
+    // Don't call ensureOpen() here (it could affect performance)
+    return maxDoc;
+  }
+
+  public boolean hasDeletions() {
+    // Don't call ensureOpen() here (it could affect performance)
+    return hasDeletions;
+  }
+
+  // check first reader
+  public boolean isDeleted(int n) {
+    // Don't call ensureOpen() here (it could affect performance)
+    if (readers.size() > 0)
+      return ((IndexReader)readers.get(0)).isDeleted(n);
+    return false;
+  }
+
+  // delete in all readers
+  protected void doDelete(int n) throws CorruptIndexException, IOException {
+    for (int i = 0; i < readers.size(); i++) {
+      ((IndexReader)readers.get(i)).deleteDocument(n);
+    }
+    hasDeletions = true;
+  }
+
+  /**
+   * @see org.apache.lucene.index.ParallelReader#doUndeleteAll
+   */
+  protected void doUndeleteAll() throws CorruptIndexException, IOException {
+    for (int i = 0; i < readers.size(); i++) {
+      ((IndexReader)readers.get(i)).undeleteAll();
+    }
+    hasDeletions = false;
+  }
+
+  /**
+   * <p><strong>ARCHIVE</strong> modification</p>
+   * <p>Return a <code>Document</code> with fields merged from parallel
+   * indices.  The values for a given field will <strong>only</strong>
+   * come from the first index that has the field.
+   * This matches the searching behavior where a field is only
+   * searched in the first index that has the field.</p>
+   * <p>This differs from the bundled Lucene <code>ParallelReader</code>,
+   * which adds all values from every index that has the field.</p>
+   * <p>The <code>fieldSelector</code> parameter is ignored.</p>
+   * <h3>Implementation Notes</h3>
+   * <p>Since getting the document from the reader is the expensive
+   * operation, we only get it once from each reader.  Once we've
+   * gotten the document from the reader, we iterate through the
+   * fields and only copy those fields that are mapped to the reader.</p>
+   * <p>The first implementation iterated through the field names,
+   * getting the document from the corresponding reader for each
+   * field name (10 fields => 10 document gets) which was a big
+   * performance hit.</p>
+   * <p>In this implementation, there are only as many document gets as
+   * there are readers.</p>
+   * @param n ordinal position of document to return
+   * @param fieldSelector ignored
+   * @return the document with field values assembled from parallel indices
+   * @throws CorruptIndexException if the index is corrupt
+   * @throws IOException if there is a low-level IO error
+   */
+  public Document document(int n, FieldSelector fieldSelector)
+    throws CorruptIndexException, IOException
+  {
+    ensureOpen();
+    Document result = new Document();
+
+    for ( IndexReader reader : (List<IndexReader>) readers )
+      {
+        Document d = reader.document( n );
+
+        for ( Fieldable f : ((List<Fieldable>) d.getFields()) )
+          {
+            if ( fieldToReader.get( f.name( ) ) == reader )
+              {
+                result.add( f );
+              }
+          }
+      }
+
+    return result;
+  }
+
+  // get all vectors
+  public TermFreqVector[] getTermFreqVectors(int n) throws IOException {
+    ensureOpen();
+    ArrayList results = new ArrayList();
+    Iterator i = fieldToReader.entrySet().iterator();
+    while (i.hasNext()) {
+      Map.Entry e = (Map.Entry)i.next();
+      String field = (String)e.getKey();
+      IndexReader reader = (IndexReader)e.getValue();
+      TermFreqVector vector = reader.getTermFreqVector(n, field);
+      if (vector != null)
+        results.add(vector);
+    }
+    return (TermFreqVector[])
+      results.toArray(new TermFreqVector[results.size()]);
+  }
+
+  public TermFreqVector getTermFreqVector(int n, String field)
+    throws IOException {
+    ensureOpen();
+    IndexReader reader = ((IndexReader)fieldToReader.get(field));
+    return reader==null ? null : reader.getTermFreqVector(n, field);
+  }
+
+
+  public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException {
+    ensureOpen();
+    IndexReader reader = ((IndexReader)fieldToReader.get(field));
+    if (reader != null) {
+      reader.getTermFreqVector(docNumber, field, mapper);
+    }
+  }
+
+  public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException {
+    ensureOpen();
+
+    Iterator i = fieldToReader.entrySet().iterator();
+    while (i.hasNext()) {
+      Map.Entry e = (Map.Entry)i.next();
+      String field = (String)e.getKey();
+      IndexReader reader = (IndexReader)e.getValue();
+      reader.getTermFreqVector(docNumber, field, mapper);
+    }
+
+  }
+
+  public boolean hasNorms(String field) throws IOException {
+    ensureOpen();
+    IndexReader reader = ((IndexReader)fieldToReader.get(field));
+    return reader==null ? false : reader.hasNorms(field);
+  }
+
+  public byte[] norms(String field) throws IOException {
+    ensureOpen();
+    IndexReader reader = ((IndexReader)fieldToReader.get(field));
+    return reader==null ?
+      null : reader.norms(field);
+  }
+
+  public void norms(String field, byte[] result, int offset)
+    throws IOException {
+    ensureOpen();
+    IndexReader reader = ((IndexReader)fieldToReader.get(field));
+    if (reader!=null)
+      reader.norms(field, result, offset);
+  }
+
+  protected void doSetNorm(int n, String field, byte value)
+    throws CorruptIndexException, IOException {
+    IndexReader reader = ((IndexReader)fieldToReader.get(field));
+    if (reader!=null)
+      reader.doSetNorm(n, field, value);
+  }
+
+  public TermEnum terms() throws IOException {
+    ensureOpen();
+    return new ParallelTermEnum();
+  }
+
+  public TermEnum terms(Term term) throws IOException {
+    ensureOpen();
+    return new ParallelTermEnum(term);
+  }
+
+  public int docFreq(Term term) throws IOException {
+    ensureOpen();
+    IndexReader reader = ((IndexReader)fieldToReader.get(term.field()));
+    return reader==null ? 0 : reader.docFreq(term);
+  }
+
+  public TermDocs termDocs(Term term) throws IOException {
+    ensureOpen();
+    return new ParallelTermDocs(term);
+  }
+
+  public TermDocs termDocs() throws IOException {
+    ensureOpen();
+    return new ParallelTermDocs();
+  }
+
+  public TermPositions termPositions(Term term) throws IOException {
+    ensureOpen();
+    return new ParallelTermPositions(term);
+  }
+
+  public TermPositions termPositions() throws IOException {
+    ensureOpen();
+    return new ParallelTermPositions();
+  }
+
+  /**
+   * Checks recursively if all subreaders are up to date.
+   */
+  public boolean isCurrent() throws CorruptIndexException, IOException {
+    for (int i = 0; i < readers.size(); i++) {
+      if (!((IndexReader)readers.get(i)).isCurrent()) {
+        return false;
+      }
+    }
+
+    // all subreaders are up to date
+    return true;
+  }
+
+  /**
+   * Checks recursively if all subindexes are optimized.
+   */
+  public boolean isOptimized() {
+    for (int i = 0; i < readers.size(); i++) {
+      if (!((IndexReader)readers.get(i)).isOptimized()) {
+        return false;
+      }
+    }
+
+    // all subindexes are optimized
+    return true;
+  }
+
+
+  /** Not implemented.
+   * @throws UnsupportedOperationException
+   */
+  public long getVersion() {
+    throw new UnsupportedOperationException("ArchiveParallelReader does not support this method.");
+  }
+
+  // for testing
+  IndexReader[] getSubReaders() {
+    return (IndexReader[]) readers.toArray(new IndexReader[readers.size()]);
+  }
+
+  protected void doCommit() throws IOException {
+    for (int i = 0; i < readers.size(); i++)
+      ((IndexReader)readers.get(i)).commit();
+  }
+
+  protected synchronized void doClose() throws IOException {
+    for (int i = 0; i < readers.size(); i++) {
+      if (((Boolean) decrefOnClose.get(i)).booleanValue()) {
+        ((IndexReader)readers.get(i)).decRef();
+      } else {
+        ((IndexReader)readers.get(i)).close();
+      }
+    }
+  }
+
+  public Collection getFieldNames (IndexReader.FieldOption fieldNames) {
+    ensureOpen();
+    Set fieldSet = new HashSet();
+    for (int i = 0; i < readers.size(); i++) {
+      IndexReader reader = ((IndexReader)readers.get(i));
+      Collection names = reader.getFieldNames(fieldNames);
+      fieldSet.addAll(names);
+    }
+    return fieldSet;
+  }
+
+  private class ParallelTermEnum extends TermEnum {
+    private String field;
+    private Iterator fieldIterator;
+    private TermEnum termEnum;
+
+    public ParallelTermEnum() throws IOException {
+      field = (String)fieldToReader.firstKey();
+      if (field != null)
+        termEnum = ((IndexReader)fieldToReader.get(field)).terms();
+    }
+
+    public ParallelTermEnum(Term term) throws IOException {
+      field = term.field();
+      IndexReader reader = ((IndexReader)fieldToReader.get(field));
+      if (reader!=null)
+        termEnum = reader.terms(term);
+    }
+
+    public boolean next() throws IOException {
+      if (termEnum==null)
+        return false;
+
+      // another term in this field?
+      if (termEnum.next() && termEnum.term().field()==field)
+        return true;                              // yes, keep going
+
+      termEnum.close();                           // close old termEnum
+
+      // find the next field with terms, if any
+      if (fieldIterator==null) {
+        fieldIterator = fieldToReader.tailMap(field).keySet().iterator();
+        fieldIterator.next();                     // Skip field to get next one
+      }
+      while (fieldIterator.hasNext()) {
+        field = (String) fieldIterator.next();
+        termEnum = ((IndexReader)fieldToReader.get(field)).terms(new Term(field, ""));
+        Term term = termEnum.term();
+        if (term!=null && term.field()==field)
+          return true;
+        else
+          termEnum.close();
+      }
+
+      return false;                               // no more fields
+    }
+
+    public Term term() {
+      if (termEnum==null)
+        return null;
+
+      return termEnum.term();
+    }
+
+    public int docFreq() {
+      if (termEnum==null)
+        return 0;
+
+      return termEnum.docFreq();
+    }
+
+    public void close() throws IOException {
+      if (termEnum!=null)
+        termEnum.close();
+    }
+
+  }
+
+  // wrap a TermDocs in order to support seek(Term)
+  private class ParallelTermDocs implements TermDocs {
+    protected TermDocs termDocs;
+
+    public ParallelTermDocs() {}
+    public ParallelTermDocs(Term term) throws IOException { seek(term); }
+
+    public int doc() { return termDocs.doc(); }
+    public int freq() { return termDocs.freq(); }
+
+    public void seek(Term term) throws IOException {
+      IndexReader reader = ((IndexReader)fieldToReader.get(term.field()));
+      termDocs = reader!=null ?
+        reader.termDocs(term) : null;
+    }
+
+    public void seek(TermEnum termEnum) throws IOException {
+      seek(termEnum.term());
+    }
+
+    public boolean next() throws IOException {
+      if (termDocs==null)
+        return false;
+
+      return termDocs.next();
+    }
+
+    public int read(final int[] docs, final int[] freqs) throws IOException {
+      if (termDocs==null)
+        return 0;
+
+      return termDocs.read(docs, freqs);
+    }
+
+    public boolean skipTo(int target) throws IOException {
+      if (termDocs==null)
+        return false;
+
+      return termDocs.skipTo(target);
+    }
+
+    public void close() throws IOException {
+      if (termDocs!=null)
+        termDocs.close();
+    }
+
+  }
+
+  private class ParallelTermPositions
+    extends ParallelTermDocs implements TermPositions {
+
+    public ParallelTermPositions() {}
+    public ParallelTermPositions(Term term) throws IOException { seek(term); }
+
+    public void seek(Term term) throws IOException {
+      IndexReader reader = ((IndexReader)fieldToReader.get(term.field()));
+      termDocs = reader!=null ? reader.termPositions(term) : null;
+    }
+
+    public int nextPosition() throws IOException {
+      // It is an error to call this if there is no next position, e.g. if termDocs==null
+      return ((TermPositions)termDocs).nextPosition();
+    }
+
+    public int getPayloadLength() {
+      return ((TermPositions)termDocs).getPayloadLength();
+    }
+
+    public byte[] getPayload(byte[] data, int offset) throws IOException {
+      return ((TermPositions)termDocs).getPayload(data, offset);
+    }
+
+
+    // TODO: Remove warning after API has been finalized
+    public boolean isPayloadAvailable() {
+      return ((TermPositions) termDocs).isPayloadAvailable();
+    }
+  }
+
+}
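For context, a minimal usage sketch of the class added above, assuming two parallel
indexes with identical maxDoc()/numDocs(); the index paths and the choice of
IndexReader.open(String) (Lucene 2.x API) are illustrative, not part of this commit:

    // Hypothetical example: "index-main" holds the large, rarely changing fields;
    // "index-dates" holds small, frequently re-indexed fields.
    import org.apache.lucene.document.Document;
    import org.apache.lucene.index.ArchiveParallelReader;
    import org.apache.lucene.index.IndexReader;

    public class ArchiveParallelReaderExample {
      public static void main(String[] args) throws Exception {
        ArchiveParallelReader reader = new ArchiveParallelReader();
        reader.add(IndexReader.open("index-main"));   // fields found here win
        reader.add(IndexReader.open("index-dates"));  // only maps fields not seen yet
        Document doc = reader.document(0);            // merged fields, first index wins per field
        System.out.println(doc);
        reader.close();  // default constructor closes subreaders too
      }
    }

Note the difference from the stock ParallelReader: document() here returns each
field's values from the first added index that defines the field, matching search
behavior, rather than concatenating values from every index.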