From: <syn...@us...> - 2008-05-28 20:59:47
|
Revision: 2634 http://clucene.svn.sourceforge.net/clucene/?rev=2634&view=rev Author: synhershko Date: 2008-05-28 13:59:39 -0700 (Wed, 28 May 2008) Log Message: ----------- FieldInfo and FieldInfos Payloads support Modified Paths: -------------- branches/lucene2_3_2/src/CLucene/index/FieldInfos.cpp branches/lucene2_3_2/src/CLucene/index/FieldInfos.h branches/lucene2_3_2/src/CLucene/index/SegmentHeader.h Modified: branches/lucene2_3_2/src/CLucene/index/FieldInfos.cpp =================================================================== --- branches/lucene2_3_2/src/CLucene/index/FieldInfos.cpp 2008-05-28 18:53:47 UTC (rev 2633) +++ branches/lucene2_3_2/src/CLucene/index/FieldInfos.cpp 2008-05-28 20:59:39 UTC (rev 2634) @@ -26,14 +26,15 @@ const bool _storeTermVector, const bool _storeOffsetWithTermVector, const bool _storePositionWithTermVector, - const bool _omitNorms): + const bool _omitNorms, + const bool _storePayloads): name(CLStringIntern::intern(_fieldName CL_FILELINE)), isIndexed(_isIndexed), number(_fieldNumber), storeTermVector(_storeTermVector), storeOffsetWithTermVector(_storeOffsetWithTermVector), storePositionWithTermVector(_storeTermVector), - omitNorms(_omitNorms) + omitNorms(_omitNorms), storePayloads(_storePayloads) { } @@ -72,13 +73,13 @@ _CLDELETE(fields); } -void FieldInfos::add( const TCHAR* name, const bool isIndexed, const bool storeTermVector, - bool storePositionWithTermVector, bool storeOffsetWithTermVector, bool omitNorms) { +FieldInfo* FieldInfos::add( const TCHAR* name, const bool isIndexed, const bool storeTermVector, + const bool storePositionWithTermVector, const bool storeOffsetWithTermVector, const bool omitNorms, const bool storePayloads) { FieldInfo* fi = fieldInfo(name); if (fi == NULL) { - addInternal(name, isIndexed, storeTermVector, + return addInternal(name, isIndexed, storeTermVector, storePositionWithTermVector, - storeOffsetWithTermVector, omitNorms); + storeOffsetWithTermVector, omitNorms, storePayloads); } else { if (fi->isIndexed != isIndexed) { fi->isIndexed = true; // once indexed, always index @@ -95,17 +96,22 @@ if (fi->omitNorms != omitNorms) { fi->omitNorms = false; // once norms are stored, always store } + if (fi->storePayloads != storePayloads) { + fi->storePayloads = true; + } } + return fi; } void FieldInfos::add(const TCHAR** names,const bool isIndexed, const bool storeTermVectors, - bool storePositionWithTermVector, bool storeOffsetWithTermVector, bool omitNorms) { - int32_t i=0; - while ( names[i] != NULL ){ - add(names[i], isIndexed, storeTermVectors, storePositionWithTermVector, - storeOffsetWithTermVector, omitNorms); - ++i; - } + const bool storePositionWithTermVector, const bool storeOffsetWithTermVector, const bool omitNorms, const bool storePayloads) +{ + int32_t i=0; + while ( names[i] != NULL ){ + add(names[i], isIndexed, storeTermVectors, storePositionWithTermVector, + storeOffsetWithTermVector, omitNorms, storePayloads); + ++i; + } } int32_t FieldInfos::fieldNumber(const TCHAR* fieldName)const { @@ -155,6 +161,7 @@ if (fi->storePositionWithTermVector) bits |= STORE_POSITIONS_WITH_TERMVECTOR; if (fi->storeOffsetWithTermVector) bits |= STORE_OFFSET_WITH_TERMVECTOR; if (fi->omitNorms) bits |= OMIT_NORMS; + if (fi->storePayloads) bits |= STORE_PAYLOADS; output->writeString(fi->name,_tcslen(fi->name)); output->writeByte(bits); @@ -164,7 +171,7 @@ void FieldInfos::read(IndexInput* input) { int32_t size = input->readVInt(); uint8_t bits; - bool isIndexed,storeTermVector,storePositionsWithTermVector,storeOffsetWithTermVector,omitNorms; + bool isIndexed,storeTermVector,storePositionsWithTermVector,storeOffsetWithTermVector,omitNorms,storePayloads; for (int32_t i = 0; i < size; ++i){ TCHAR* name = input->readString(); //we could read name into a string buffer, but we can't be sure what the maximum field length will be. bits = input->readByte(); @@ -173,17 +180,19 @@ storePositionsWithTermVector = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0; storeOffsetWithTermVector = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0; omitNorms = (bits & OMIT_NORMS) != 0; + storePayloads = (bits & STORE_PAYLOADS) != 0; - addInternal(name, isIndexed, storeTermVector, storePositionsWithTermVector, storeOffsetWithTermVector, omitNorms); + addInternal(name, isIndexed, storeTermVector, storePositionsWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads); _CLDELETE_CARRAY(name); } } -void FieldInfos::addInternal( const TCHAR* name, const bool isIndexed, const bool storeTermVector, - bool storePositionWithTermVector, bool storeOffsetWithTermVector, bool omitNorms) { +FieldInfo* FieldInfos::addInternal( const TCHAR* name, const bool isIndexed, const bool storeTermVector, + const bool storePositionWithTermVector, const bool storeOffsetWithTermVector, const bool omitNorms, const bool storePayloads) { FieldInfo* fi = _CLNEW FieldInfo(name, isIndexed, byNumber.size(), storeTermVector, - storePositionWithTermVector, storeOffsetWithTermVector, omitNorms); + storePositionWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads); byNumber.push_back(fi); byName.put( fi->name, fi); + return fi; } bool FieldInfos::hasVectors() const{ @@ -193,4 +202,17 @@ } return false; } + +FieldInfos* FieldInfos::clone() +{ + FieldInfos* fis = _CLNEW FieldInfos(); + const size_t numField = byNumber.size(); + for(size_t i=0;i<numField;i++) { + FieldInfo* fi = byNumber[i]->clone(); + fis->byNumber.push_back(fi); + fis->byName.put( fi->name, fi); + } + return fis; +} + CL_NS_END Modified: branches/lucene2_3_2/src/CLucene/index/FieldInfos.h =================================================================== --- branches/lucene2_3_2/src/CLucene/index/FieldInfos.h 2008-05-28 18:53:47 UTC (rev 2633) +++ branches/lucene2_3_2/src/CLucene/index/FieldInfos.h 2008-05-28 20:59:39 UTC (rev 2634) @@ -37,6 +37,8 @@ bool omitNorms; // omit norms associated with indexed fields + bool storePayloads; // whether this field stores payloads together with term positions + //Func - Constructor // Initialises FieldInfo. // na holds the name of the field @@ -55,20 +57,26 @@ const bool storeTermVector, const bool storeOffsetWithTermVector, const bool storePositionWithTermVector, - const bool omitNorms); + const bool omitNorms, + const bool storePayloads); //Func - Destructor //Pre - true //Post - The instance has been destroyed ~FieldInfo(); + + FieldInfo* clone() { + return _CLNEW FieldInfo(name, isIndexed, number, storeTermVector, storePositionWithTermVector, + storeOffsetWithTermVector, omitNorms, storePayloads); + } }; -/** Access to the Field Info file that describes document fields and whether or -* not they are indexed. Each segment has a separate Field Info file. Objects -* of this class are thread-safe for multiple readers, but only one thread can -* be adding documents at a time, with no other reader or writer threads -* accessing this object. -*/ +/** Access to the Fieldable Info file that describes document fields and whether or + * not they are indexed. Each segment has a separate Fieldable Info file. Objects + * of this class are thread-safe for multiple readers, but only one thread can + * be adding documents at a time, with no other reader or writer threads + * accessing this object. + */ class FieldInfos :LUCENE_BASE{ private: //we now use internd field names, so we can use the voidCompare @@ -85,7 +93,8 @@ STORE_TERMVECTOR = 0x2, STORE_POSITIONS_WITH_TERMVECTOR = 0x4, STORE_OFFSET_WITH_TERMVECTOR = 0x8, - OMIT_NORMS = 0x10 + OMIT_NORMS = 0x10, + STORE_PAYLOADS = 0x20 }; FieldInfos(); @@ -127,26 +136,32 @@ bool hasVectors() const; + /** + * Returns a deep clone of this FieldInfos instance. + */ + FieldInfos* clone(); + // Adds field info for a Document. void add(const CL_NS(document)::Document* doc); // Merges in information from another FieldInfos. void add(FieldInfos* other); - - /** If the field is not yet known, adds it. If it is known, checks to make - * sure that the isIndexed flag is the same as was given previously for this - * field. If not - marks it as being indexed. Same goes for the TermVector - * parameters. - * - * @param name The name of the field - * @param isIndexed true if the field is indexed - * @param storeTermVector true if the term vector should be stored - * @param storePositionWithTermVector true if the term vector with positions should be stored - * @param storeOffsetWithTermVector true if the term vector with offsets should be stored + /** If the field is not yet known, adds it. If it is known, checks to make + * sure that the isIndexed flag is the same as was given previously for this + * field. If not - marks it as being indexed. Same goes for the TermVector + * parameters. + * + * @param name The name of the field + * @param isIndexed true if the field is indexed + * @param storeTermVector true if the term vector should be stored + * @param storePositionWithTermVector true if the term vector with positions should be stored + * @param storeOffsetWithTermVector true if the term vector with offsets should be stored + * @param omitNorms true if the norms for the indexed field should be omitted + * @param storePayloads true if payloads should be stored for this field */ - void add(const TCHAR* name, const bool isIndexed, const bool storeTermVector=false, - bool storePositionWithTermVector=false, bool storeOffsetWithTermVector=false, bool omitNorms=false); + FieldInfo* add(const TCHAR* name, const bool isIndexed, const bool storeTermVector=false, + const bool storePositionWithTermVector=false, const bool storeOffsetWithTermVector=false, const bool omitNorms=false, const bool storePayloads=false); /** * Assumes the fields are not storing term vectors @@ -157,15 +172,16 @@ * @see #add(String, boolean) */ void add(const TCHAR** names, const bool isIndexed, const bool storeTermVector=false, - bool storePositionWithTermVector=false, bool storeOffsetWithTermVector=false, bool omitNorms=false); + const bool storePositionWithTermVector=false, const bool storeOffsetWithTermVector=false, const bool omitNorms=false, const bool storePayloads=false); void write(CL_NS(store)::Directory* d, const char* name) const; void write(CL_NS(store)::IndexOutput* output) const; private: void read(CL_NS(store)::IndexInput* input); - void addInternal( const TCHAR* name,const bool isIndexed, const bool storeTermVector, - const bool storePositionWithTermVector, const bool storeOffsetWithTermVector, const bool omitNorms); + // was void + FieldInfo* addInternal( const TCHAR* name,const bool isIndexed, const bool storeTermVector, + const bool storePositionWithTermVector, const bool storeOffsetWithTermVector, const bool omitNorms, const bool storePayloads); }; CL_NS_END Modified: branches/lucene2_3_2/src/CLucene/index/SegmentHeader.h =================================================================== --- branches/lucene2_3_2/src/CLucene/index/SegmentHeader.h 2008-05-28 18:53:47 UTC (rev 2633) +++ branches/lucene2_3_2/src/CLucene/index/SegmentHeader.h 2008-05-28 20:59:39 UTC (rev 2634) @@ -1,124 +1,124 @@ -/*------------------------------------------------------------------------------ -* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team -* -* Distributable under the terms of either the Apache License (Version 2.0) or -* the GNU Lesser General Public License, as specified in the COPYING file. -------------------------------------------------------------------------------*/ -#ifndef _lucene_index_SegmentHeader_ -#define _lucene_index_SegmentHeader_ - -#if defined(_LUCENE_PRAGMA_ONCE) -# pragma once -#endif - -#include "SegmentInfos.h" -#include "CLucene/util/BitSet.h" -#include "CLucene/util/VoidMap.h" -#include "Term.h" -#include "FieldInfos.h" -#include "FieldsReader.h" -#include "IndexReader.h" -#include "TermInfosReader.h" -#include "CompoundFile.h" -#include "CLucene/util/ThreadLocal.h" - -CL_NS_DEF(index) -class SegmentReader; - -class SegmentTermDocs:public virtual TermDocs { - - int32_t _doc; - - int32_t skipInterval; - - int64_t freqBasePointer; - int64_t proxBasePointer; - - int32_t numSkips; - int32_t skipCount; - CL_NS(store)::IndexInput* skipStream; - int32_t skipDoc; - int64_t freqPointer; - int64_t proxPointer; - int64_t skipPointer; - bool haveSkipped; - -protected: - // SegmentReader parent - const SegmentReader* parent; - CL_NS(store)::IndexInput* freqStream; - int32_t count; - int32_t df; - int32_t _freq; - CL_NS(util)::BitSet* deletedDocs; -public: - virtual ~SegmentTermDocs(); - - virtual void seek(TermEnum* termEnum); - virtual void seek(Term* term); - virtual void seek(const TermInfo* ti); - - virtual void close(); - virtual int32_t doc()const; - virtual int32_t freq()const; - - virtual bool next(); - - /** Optimized implementation. */ - virtual int32_t read(int32_t* docs, int32_t* freqs, int32_t length); - - /** Optimized implementation. */ - virtual bool skipTo(const int32_t target); - - virtual TermPositions* __asTermPositions(); - - ///\param Parent must be a segment reader - SegmentTermDocs( const SegmentReader* Parent); -protected: - virtual void skippingDoc(){} - virtual void skipProx(int64_t proxPointer){} -}; - - -class SegmentTermPositions: public SegmentTermDocs, public TermPositions { -private: - CL_NS(store)::IndexInput* proxStream; - int32_t proxCount; - int32_t position; - +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#ifndef _lucene_index_SegmentHeader_ +#define _lucene_index_SegmentHeader_ + +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif + +#include "SegmentInfos.h" +#include "CLucene/util/BitSet.h" +#include "CLucene/util/VoidMap.h" +#include "Term.h" +#include "FieldInfos.h" +#include "FieldsReader.h" +#include "IndexReader.h" +#include "TermInfosReader.h" +#include "CompoundFile.h" +#include "CLucene/util/ThreadLocal.h" + +CL_NS_DEF(index) +class SegmentReader; + +class SegmentTermDocs:public virtual TermDocs { + + int32_t _doc; + + int32_t skipInterval; + + int64_t freqBasePointer; + int64_t proxBasePointer; + + int32_t numSkips; + int32_t skipCount; + CL_NS(store)::IndexInput* skipStream; + int32_t skipDoc; + int64_t freqPointer; + int64_t proxPointer; + int64_t skipPointer; + bool haveSkipped; + +protected: + // SegmentReader parent + const SegmentReader* parent; + CL_NS(store)::IndexInput* freqStream; + int32_t count; + int32_t df; + int32_t _freq; + CL_NS(util)::BitSet* deletedDocs; +public: + virtual ~SegmentTermDocs(); + + virtual void seek(TermEnum* termEnum); + virtual void seek(Term* term); + virtual void seek(const TermInfo* ti); + + virtual void close(); + virtual int32_t doc()const; + virtual int32_t freq()const; + + virtual bool next(); + + /** Optimized implementation. */ + virtual int32_t read(int32_t* docs, int32_t* freqs, int32_t length); + + /** Optimized implementation. */ + virtual bool skipTo(const int32_t target); + + virtual TermPositions* __asTermPositions(); + + ///\param Parent must be a segment reader + SegmentTermDocs( const SegmentReader* Parent); +protected: + virtual void skippingDoc(){} + virtual void skipProx(int64_t proxPointer){} +}; + + +class SegmentTermPositions: public SegmentTermDocs, public TermPositions { +private: + CL_NS(store)::IndexInput* proxStream; + int32_t proxCount; + int32_t position; + // the current payload length int32_t payloadLength; // indicates whether the payload of the currend position has // been read from the proxStream yet - bool needToLoadPayload; - - int64_t lazySkipPointer; - int64_t lazySkipDocCount; - //int32_t lazySkipProxCount; - - void skipPositions( int32_t n ); - void lazySkip(); - -public: - ///\param Parent must be a segment reader - SegmentTermPositions(const SegmentReader* Parent); - ~SegmentTermPositions(); - - void seek(const TermInfo* ti); - void close(); - int32_t nextPosition(); - bool next(); - int32_t read(int32_t* docs, int32_t* freqs, int32_t length); - virtual TermDocs* __asTermDocs(); - virtual TermPositions* __asTermPositions(); - - //resolve SegmentTermDocs/TermPositions ambiguity - void seek(Term* term){ SegmentTermDocs::seek(term); } - void seek(TermEnum* termEnum){ SegmentTermDocs::seek(termEnum); } - int32_t doc() const{ return SegmentTermDocs::doc(); } - int32_t freq() const{ return SegmentTermDocs::freq(); } - bool skipTo(const int32_t target){ return SegmentTermDocs::skipTo(target); } - + bool needToLoadPayload; + + int64_t lazySkipPointer; + int64_t lazySkipDocCount; + //int32_t lazySkipProxCount; + + void skipPositions( int32_t n ); + void lazySkip(); + +public: + ///\param Parent must be a segment reader + SegmentTermPositions(const SegmentReader* Parent); + ~SegmentTermPositions(); + + void seek(const TermInfo* ti); + void close(); + int32_t nextPosition(); + bool next(); + int32_t read(int32_t* docs, int32_t* freqs, int32_t length); + virtual TermDocs* __asTermDocs(); + virtual TermPositions* __asTermPositions(); + + //resolve SegmentTermDocs/TermPositions ambiguity + void seek(Term* term){ SegmentTermDocs::seek(term); } + void seek(TermEnum* termEnum){ SegmentTermDocs::seek(termEnum); } + int32_t doc() const{ return SegmentTermDocs::doc(); } + int32_t freq() const{ return SegmentTermDocs::freq(); } + bool skipTo(const int32_t target){ return SegmentTermDocs::skipTo(target); } + int32_t getPayloadLength() const { return payloadLength; } @@ -145,217 +145,217 @@ proxStream->readBytes(retArray, retOffset/*, payloadLength*/); needToLoadPayload = false; return retArray; - } - + } + bool isPayloadAvailable() const { return needToLoadPayload && (payloadLength > 0); - } - -protected: - void skippingDoc(); - /** Called by super.skipTo(). */ - void skipProx(int64_t proxPointer); -}; - - - - -/** -* An IndexReader responsible for reading 1 segment of an index -*/ -class SegmentReader: public IndexReader{ - /** - * The class Norm represents the normalizations for a field. - * These normalizations are read from an IndexInput in into an array of bytes called bytes - */ - class Norm :LUCENE_BASE{ - int32_t number; - int64_t normSeek; - SegmentReader* reader; - const char* segment; ///< pointer to segment name - public: - CL_NS(store)::IndexInput* in; - uint8_t* bytes; - bool dirty; - //Constructor - Norm(CL_NS(store)::IndexInput* instrm, int32_t number, SegmentReader* reader, const char* segment); - Norm(CL_NS(store)::IndexInput* instrm, int32_t number, int64_t normSeek, SegmentReader* reader, const char* segment); - //Destructor - ~Norm(); - - void reWrite(); - }; - friend class SegmentReader::Norm; - - //Holds the name of the segment that is being read - const char* segment; - - //Indicates if there are documents marked as deleted - bool deletedDocsDirty; - bool normsDirty; - bool undeleteAll; - - //Holds all norms for all fields in the segment - typedef CL_NS(util)::CLHashtable<const TCHAR*,Norm*,CL_NS(util)::Compare::TChar, CL_NS(util)::Equals::TChar> NormsType; - NormsType _norms; - - uint8_t* ones; - uint8_t* fakeNorms(); - - uint8_t hasSingleNorm; - CL_NS(store)::IndexInput* singleNormStream; - - // Compound File Reader when based on a compound file segment - CompoundFileReader* cfsReader; - ///Reads the Field Info file - FieldsReader* fieldsReader; - TermVectorsReader* termVectorsReaderOrig; - CL_NS(util)::ThreadLocal<TermVectorsReader*, - CL_NS(util)::Deletor::Object<TermVectorsReader> >termVectorsLocal; - - void initialize(SegmentInfo* si); - - /** - * Create a clone from the initial TermVectorsReader and store it in the ThreadLocal. - * @return TermVectorsReader - */ - TermVectorsReader* getTermVectorsReader(); - -protected: - ///Marks document docNum as deleted - void doDelete(const int32_t docNum); - void doUndeleteAll(); - void doCommit(); - void doSetNorm(int32_t doc, const TCHAR* field, uint8_t value); - - // can return null if norms aren't stored - uint8_t* getNorms(const TCHAR* field); - -public: - /** - Func - Constructor. - Opens all files of a segment - .fnm -> Field Info File - Field names are stored in the field info file, with suffix .fnm. - .frq -> Frequency File - The .frq file contains the lists of documents which contain - each term, along with the frequency of the term in that document. - .prx -> Prox File - The prox file contains the lists of positions that each term occurs - at within documents. - .tis -> Term Info File - This file is sorted by Term. Terms are ordered first lexicographically - by the term's field name, and within that lexicographically by the term's text. - .del -> Deletion File - The .del file is optional, and only exists when a segment contains deletions - .f[0-9]* -> Norm File - Contains s, for each document, a byte that encodes a value that is - multiplied into the score for hits on that field: - */ - SegmentReader(SegmentInfo* si); - - SegmentReader(SegmentInfos* sis, SegmentInfo* si); - ///Destructor. - virtual ~SegmentReader(); - - ///Closes all streams to the files of a single segment - void doClose(); - - ///Checks if a segment managed by SegmentInfo si has deletions - static bool hasDeletions(const SegmentInfo* si); - bool hasDeletions() const; - bool hasNorms(const TCHAR* field) const; - - ///Returns all file names managed by this SegmentReader - void files(CL_NS(util)::AStringArrayWithDeletor& retarray); - ///Returns an enumeration of all the Terms and TermInfos in the set. - TermEnum* terms() const; - ///Returns an enumeration of terms starting at or after the named term t - TermEnum* terms(const Term* t) const; - - ///Gets the document identified by n - bool document(int32_t n, CL_NS(document)::Document* doc); - - ///Checks if the n-th document has been marked deleted - bool isDeleted(const int32_t n); - - ///Returns an unpositioned TermDocs enumerator. - TermDocs* termDocs() const; - ///Returns an unpositioned TermPositions enumerator. - TermPositions* termPositions() const; - - ///Returns the number of documents which contain the term t - int32_t docFreq(const Term* t) const; - - ///Returns the actual number of documents in the segment - int32_t numDocs(); - ///Returns the number of all the documents in the segment including the ones that have - ///been marked deleted - int32_t maxDoc() const; - - ///Returns the bytes array that holds the norms of a named field. - ///Returns fake norms if norms aren't available - uint8_t* norms(const TCHAR* field); - - ///Reads the Norms for field from disk - void norms(const TCHAR* field, uint8_t* bytes); - - ///concatenating segment with ext and x - char* SegmentName(const char* ext, const int32_t x=-1); - ///Creates a filename in buffer by concatenating segment with ext and x - void SegmentName(char* buffer,int32_t bufferLen,const char* ext, const int32_t x=-1 ); - - /** - * @see IndexReader#getFieldNames(IndexReader.FieldOption fldOption) - */ - void getFieldNames(FieldOption fldOption, CL_NS(util)::StringArrayWithDeletor& retarray); - - static bool usesCompoundFile(SegmentInfo* si); - - /** Return a term frequency vector for the specified document and field. The - * vector returned contains term numbers and frequencies for all terms in - * the specified field of this document, if the field had storeTermVector - * flag set. If the flag was not set, the method returns null. - * @throws IOException - */ - TermFreqVector* getTermFreqVector(int32_t docNumber, const TCHAR* field=NULL); - - /** Return an array of term frequency vectors for the specified document. - * The array contains a vector for each vectorized field in the document. - * Each vector vector contains term numbers and frequencies for all terms - * in a given vectorized field. - * If no such fields existed, the method returns null. - * @throws IOException - */ - bool getTermFreqVectors(int32_t docNumber, Array<TermFreqVector*>& result); -private: - //Open all norms files for all fields - void openNorms(CL_NS(store)::Directory* cfsDir); - //Closes all norms files - void closeNorms(); - - ///a bitVector that manages which documents have been deleted - CL_NS(util)::BitSet* deletedDocs; - ///an IndexInput to the frequency file - CL_NS(store)::IndexInput* freqStream; - ///For reading the fieldInfos file - FieldInfos* fieldInfos; - ///For reading the Term Dictionary .tis file - TermInfosReader* tis; - ///an IndexInput to the prox file - CL_NS(store)::IndexInput* proxStream;\ - - static bool hasSeparateNorms(SegmentInfo* si); - static uint8_t* createFakeNorms(int32_t size); - - //allow various classes to access the internals of this. this allows us to have - //a more tight idea of the package - friend class IndexReader; - friend class IndexWriter; - friend class SegmentTermDocs; - friend class SegmentTermPositions; - friend class MultiReader; -}; - -CL_NS_END -#endif + } + +protected: + void skippingDoc(); + /** Called by super.skipTo(). */ + void skipProx(int64_t proxPointer); +}; + + + + +/** +* An IndexReader responsible for reading 1 segment of an index +*/ +class SegmentReader: public IndexReader{ + /** + * The class Norm represents the normalizations for a field. + * These normalizations are read from an IndexInput in into an array of bytes called bytes + */ + class Norm :LUCENE_BASE{ + int32_t number; + int64_t normSeek; + SegmentReader* reader; + const char* segment; ///< pointer to segment name + public: + CL_NS(store)::IndexInput* in; + uint8_t* bytes; + bool dirty; + //Constructor + Norm(CL_NS(store)::IndexInput* instrm, int32_t number, SegmentReader* reader, const char* segment); + Norm(CL_NS(store)::IndexInput* instrm, int32_t number, int64_t normSeek, SegmentReader* reader, const char* segment); + //Destructor + ~Norm(); + + void reWrite(); + }; + friend class SegmentReader::Norm; + + //Holds the name of the segment that is being read + const char* segment; + + //Indicates if there are documents marked as deleted + bool deletedDocsDirty; + bool normsDirty; + bool undeleteAll; + + //Holds all norms for all fields in the segment + typedef CL_NS(util)::CLHashtable<const TCHAR*,Norm*,CL_NS(util)::Compare::TChar, CL_NS(util)::Equals::TChar> NormsType; + NormsType _norms; + + uint8_t* ones; + uint8_t* fakeNorms(); + + uint8_t hasSingleNorm; + CL_NS(store)::IndexInput* singleNormStream; + + // Compound File Reader when based on a compound file segment + CompoundFileReader* cfsReader; + ///Reads the Field Info file + FieldsReader* fieldsReader; + TermVectorsReader* termVectorsReaderOrig; + CL_NS(util)::ThreadLocal<TermVectorsReader*, + CL_NS(util)::Deletor::Object<TermVectorsReader> >termVectorsLocal; + + void initialize(SegmentInfo* si); + + /** + * Create a clone from the initial TermVectorsReader and store it in the ThreadLocal. + * @return TermVectorsReader + */ + TermVectorsReader* getTermVectorsReader(); + +protected: + ///Marks document docNum as deleted + void doDelete(const int32_t docNum); + void doUndeleteAll(); + void doCommit(); + void doSetNorm(int32_t doc, const TCHAR* field, uint8_t value); + + // can return null if norms aren't stored + uint8_t* getNorms(const TCHAR* field); + +public: + /** + Func - Constructor. + Opens all files of a segment + .fnm -> Field Info File + Field names are stored in the field info file, with suffix .fnm. + .frq -> Frequency File + The .frq file contains the lists of documents which contain + each term, along with the frequency of the term in that document. + .prx -> Prox File + The prox file contains the lists of positions that each term occurs + at within documents. + .tis -> Term Info File + This file is sorted by Term. Terms are ordered first lexicographically + by the term's field name, and within that lexicographically by the term's text. + .del -> Deletion File + The .del file is optional, and only exists when a segment contains deletions + .f[0-9]* -> Norm File + Contains s, for each document, a byte that encodes a value that is + multiplied into the score for hits on that field: + */ + SegmentReader(SegmentInfo* si); + + SegmentReader(SegmentInfos* sis, SegmentInfo* si); + ///Destructor. + virtual ~SegmentReader(); + + ///Closes all streams to the files of a single segment + void doClose(); + + ///Checks if a segment managed by SegmentInfo si has deletions + static bool hasDeletions(const SegmentInfo* si); + bool hasDeletions() const; + bool hasNorms(const TCHAR* field) const; + + ///Returns all file names managed by this SegmentReader + void files(CL_NS(util)::AStringArrayWithDeletor& retarray); + ///Returns an enumeration of all the Terms and TermInfos in the set. + TermEnum* terms() const; + ///Returns an enumeration of terms starting at or after the named term t + TermEnum* terms(const Term* t) const; + + ///Gets the document identified by n + bool document(int32_t n, CL_NS(document)::Document* doc); + + ///Checks if the n-th document has been marked deleted + bool isDeleted(const int32_t n); + + ///Returns an unpositioned TermDocs enumerator. + TermDocs* termDocs() const; + ///Returns an unpositioned TermPositions enumerator. + TermPositions* termPositions() const; + + ///Returns the number of documents which contain the term t + int32_t docFreq(const Term* t) const; + + ///Returns the actual number of documents in the segment + int32_t numDocs(); + ///Returns the number of all the documents in the segment including the ones that have + ///been marked deleted + int32_t maxDoc() const; + + ///Returns the bytes array that holds the norms of a named field. + ///Returns fake norms if norms aren't available + uint8_t* norms(const TCHAR* field); + + ///Reads the Norms for field from disk + void norms(const TCHAR* field, uint8_t* bytes); + + ///concatenating segment with ext and x + char* SegmentName(const char* ext, const int32_t x=-1); + ///Creates a filename in buffer by concatenating segment with ext and x + void SegmentName(char* buffer,int32_t bufferLen,const char* ext, const int32_t x=-1 ); + + /** + * @see IndexReader#getFieldNames(IndexReader.FieldOption fldOption) + */ + void getFieldNames(FieldOption fldOption, CL_NS(util)::StringArrayWithDeletor& retarray); + + static bool usesCompoundFile(SegmentInfo* si); + + /** Return a term frequency vector for the specified document and field. The + * vector returned contains term numbers and frequencies for all terms in + * the specified field of this document, if the field had storeTermVector + * flag set. If the flag was not set, the method returns null. + * @throws IOException + */ + TermFreqVector* getTermFreqVector(int32_t docNumber, const TCHAR* field=NULL); + + /** Return an array of term frequency vectors for the specified document. + * The array contains a vector for each vectorized field in the document. + * Each vector vector contains term numbers and frequencies for all terms + * in a given vectorized field. + * If no such fields existed, the method returns null. + * @throws IOException + */ + bool getTermFreqVectors(int32_t docNumber, Array<TermFreqVector*>& result); +private: + //Open all norms files for all fields + void openNorms(CL_NS(store)::Directory* cfsDir); + //Closes all norms files + void closeNorms(); + + ///a bitVector that manages which documents have been deleted + CL_NS(util)::BitSet* deletedDocs; + ///an IndexInput to the frequency file + CL_NS(store)::IndexInput* freqStream; + ///For reading the fieldInfos file + FieldInfos* fieldInfos; + ///For reading the Term Dictionary .tis file + TermInfosReader* tis; + ///an IndexInput to the prox file + CL_NS(store)::IndexInput* proxStream;\ + + static bool hasSeparateNorms(SegmentInfo* si); + static uint8_t* createFakeNorms(int32_t size); + + //allow various classes to access the internals of this. this allows us to have + //a more tight idea of the package + friend class IndexReader; + friend class IndexWriter; + friend class SegmentTermDocs; + friend class SegmentTermPositions; + friend class MultiReader; +}; + +CL_NS_END +#endif This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |