|
From: <syn...@us...> - 2009-04-12 13:21:14
|
Revision: 2985
http://clucene.svn.sourceforge.net/clucene/?rev=2985&view=rev
Author: synhershko
Date: 2009-04-12 13:21:07 +0000 (Sun, 12 Apr 2009)
Log Message:
-----------
Brings 2.3.2 support for TermVectorReader and supporting classes.
Also:
* Updates ObjectArray with a better coding interface and memory management
* Makes several dependent classes use ObjectArray/ValueArray instead of Array<>
* Updates tests to comply with new Array.h changes
Modified Paths:
--------------
branches/lucene2_3_2/src/core/CLucene/files_list.txt
branches/lucene2_3_2/src/core/CLucene/index/DocumentWriter.cpp
branches/lucene2_3_2/src/core/CLucene/index/IndexReader.cpp
branches/lucene2_3_2/src/core/CLucene/index/IndexReader.h
branches/lucene2_3_2/src/core/CLucene/index/MultiReader.cpp
branches/lucene2_3_2/src/core/CLucene/index/MultiReader.h
branches/lucene2_3_2/src/core/CLucene/index/SegmentMerger.cpp
branches/lucene2_3_2/src/core/CLucene/index/SegmentReader.cpp
branches/lucene2_3_2/src/core/CLucene/index/SegmentTermDocs.cpp
branches/lucene2_3_2/src/core/CLucene/index/SegmentTermPositions.cpp
branches/lucene2_3_2/src/core/CLucene/index/SegmentTermVector.cpp
branches/lucene2_3_2/src/core/CLucene/index/TermVector.h
branches/lucene2_3_2/src/core/CLucene/index/TermVectorReader.cpp
branches/lucene2_3_2/src/core/CLucene/index/TermVectorWriter.cpp
branches/lucene2_3_2/src/core/CLucene/index/Terms.h
branches/lucene2_3_2/src/core/CLucene/index/_DocumentWriter.h
branches/lucene2_3_2/src/core/CLucene/index/_MultiReader.h
branches/lucene2_3_2/src/core/CLucene/index/_SegmentHeader.h
branches/lucene2_3_2/src/core/CLucene/index/_TermVector.h
branches/lucene2_3_2/src/core/CLucene/search/PhrasePositions.cpp
branches/lucene2_3_2/src/core/CLucene/search/_PhrasePositions.h
branches/lucene2_3_2/src/core/CLucene/util/Array.h
branches/lucene2_3_2/src/core/CMakeLists.txt
branches/lucene2_3_2/src/test/search/TestTermVector.cpp
Added Paths:
-----------
branches/lucene2_3_2/src/core/CLucene/index/DefaultSkipListReader.cpp
branches/lucene2_3_2/src/core/CLucene/index/DefaultSkipListReader.h
branches/lucene2_3_2/src/core/CLucene/index/MultiLevelSkipListReader.cpp
branches/lucene2_3_2/src/core/CLucene/index/MultiLevelSkipListReader.h
Modified: branches/lucene2_3_2/src/core/CLucene/files_list.txt
===================================================================
(Binary files differ)
Added: branches/lucene2_3_2/src/core/CLucene/index/DefaultSkipListReader.cpp
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/index/DefaultSkipListReader.cpp (rev 0)
+++ branches/lucene2_3_2/src/core/CLucene/index/DefaultSkipListReader.cpp 2009-04-12 13:21:07 UTC (rev 2985)
@@ -0,0 +1,86 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+*
+* Distributable under the terms of either the Apache License (Version 2.0) or
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+
+#include "CLucene/_ApiHeader.h"
+#include "DefaultSkipListReader.h"
+
+CL_NS_DEF(index)
+
+/**
+ * Creates a skip list reader for the default posting format.
+ *
+ * Allocates one slot per skip level for the freq pointer, the prox pointer and
+ * the payload length. All per-term state starts at zero so that nothing is
+ * read uninitialized: seekChild() and getPayloadLength() read
+ * lastPayloadLength, and init() only rewrites the first numberOfSkipLevels
+ * array slots.
+ *
+ * @param _skipStream   stream holding the skip data (forwarded to the base class)
+ * @param maxSkipLevels number of skip levels to allocate per-level state for
+ * @param _skipInterval skip interval of the lowest level (forwarded to the base class)
+ */
+DefaultSkipListReader::DefaultSkipListReader(CL_NS(store)::IndexInput* _skipStream, const int32_t maxSkipLevels, const int32_t _skipInterval)
+    : MultiLevelSkipListReader(_skipStream, maxSkipLevels, _skipInterval),
+      currentFieldStoresPayloads(false),
+      freqPointer(_CL_NEWARRAY(int64_t,maxSkipLevels)),
+      proxPointer(_CL_NEWARRAY(int64_t,maxSkipLevels)),
+      payloadLength(_CL_NEWARRAY(int32_t,maxSkipLevels)),
+      lastFreqPointer(0),
+      lastProxPointer(0),
+      lastPayloadLength(0)
+{
+    // Give every level a defined starting value, independent of how the
+    // allocation macro initializes memory.
+    for (int32_t level = 0; level < maxSkipLevels; ++level) {
+        freqPointer[level] = 0;
+        proxPointer[level] = 0;
+        payloadLength[level] = 0;
+    }
+}
+
+/** Releases the three per-level arrays allocated by the constructor. */
+DefaultSkipListReader::~DefaultSkipListReader(){
+    // These buffers come from _CL_NEWARRAY, so release them with the array
+    // macro, exactly as MultiLevelSkipListReader does for its own
+    // _CL_NEWARRAY buffers; the single-object macro does not match the allocation.
+    _CLDELETE_LARRAY(freqPointer);
+    _CLDELETE_LARRAY(proxPointer);
+    _CLDELETE_LARRAY(payloadLength);
+}
+
+/**
+ * Prepares the reader for the skip data of a new term.
+ *
+ * @param _skipPointer    start of the term's skip data (handed to the base class)
+ * @param freqBasePointer value every per-level freq pointer restarts from
+ * @param proxBasePointer value every per-level prox pointer restarts from
+ * @param df              document frequency of the term (handed to the base class)
+ * @param storesPayloads  whether readSkipData() must decode payload lengths
+ */
+void DefaultSkipListReader::init(const int64_t _skipPointer, const int64_t freqBasePointer, const int64_t proxBasePointer, const int32_t df, const bool storesPayloads) {
+    MultiLevelSkipListReader::init(_skipPointer, df);
+
+    currentFieldStoresPayloads = storesPayloads;
+    lastProxPointer = proxBasePointer;
+    lastFreqPointer = freqBasePointer;
+
+    // NOTE(review): only the first numberOfSkipLevels slots are reset, and that
+    // count is whatever the previous term left behind - confirm higher levels
+    // cannot carry stale values into the next term.
+    int32_t level = 0;
+    while (level < numberOfSkipLevels) {
+        payloadLength[level] = 0;
+        proxPointer[level] = proxBasePointer;
+        freqPointer[level] = freqBasePointer;
+        ++level;
+    }
+}
+
+/** Freq pointer recorded by the most recent setLastSkipData() (or by init()). */
+int64_t DefaultSkipListReader::getFreqPointer() const {
+    return this->lastFreqPointer;
+}
+/** Prox pointer recorded by the most recent setLastSkipData() (or by init()). */
+int64_t DefaultSkipListReader::getProxPointer() const {
+    return this->lastProxPointer;
+}
+/** Payload length recorded by the most recent setLastSkipData(). */
+int32_t DefaultSkipListReader::getPayloadLength() const {
+    return this->lastPayloadLength;
+}
+
+/**
+ * Seeks the skip entry on the given (lower) level and seeds that level's
+ * freq/prox/payload state from the last skip entry that was read.
+ *
+ * @param level the level being descended to
+ */
+void DefaultSkipListReader::seekChild(const int32_t level) {
+    // The base implementation repositions skipStream[level] and resets
+    // numSkipped/skipDoc/childPointer for that level; without it the stream for
+    // this level is never moved to the child entry.
+    MultiLevelSkipListReader::seekChild(level);
+    freqPointer[level] = lastFreqPointer;
+    proxPointer[level] = lastProxPointer;
+    payloadLength[level] = lastPayloadLength;
+}
+
+/**
+ * Records the values of the current skip entry on the given level as the
+ * "last" values returned by the getters.
+ *
+ * @param level the level whose current entry is being recorded
+ */
+void DefaultSkipListReader::setLastSkipData(const int32_t level) {
+    // The base implementation records lastDoc and lastChildPointer, which
+    // getDoc(), skipTo() and seekChild() depend on; it must run as well.
+    MultiLevelSkipListReader::setLastSkipData(level);
+    lastFreqPointer = freqPointer[level];
+    lastProxPointer = proxPointer[level];
+    lastPayloadLength = payloadLength[level];
+}
+
+/**
+ * Decodes one skip entry of the given level from _skipStream.
+ *
+ * Advances freqPointer[level] and proxPointer[level] by the deltas stored in
+ * the entry and, for fields with payloads, updates payloadLength[level] when
+ * the entry carries a new length.
+ *
+ * @param level       level the entry belongs to
+ * @param _skipStream stream to read the entry from
+ * @return the doc delta stored in the entry
+ */
+int32_t DefaultSkipListReader::readSkipData(const int32_t level, CL_NS(store)::IndexInput* _skipStream) {
+    // Both encodings start with a single VInt.
+    int32_t docDelta = _skipStream->readVInt();
+
+    if (currentFieldStoresPayloads) {
+        // With payloads the low bit flags "payload length differs from the
+        // previous one"; the length then follows, and the real doc delta is
+        // stored in the remaining bits.
+        const bool payloadLengthFollows = (docDelta & 1) != 0;
+        if (payloadLengthFollows) {
+            payloadLength[level] = _skipStream->readVInt();
+        }
+        docDelta = (int32_t)(((uint32_t)docDelta) >> (uint32_t)1);
+    }
+
+    freqPointer[level] += _skipStream->readVInt();
+    proxPointer[level] += _skipStream->readVInt();
+
+    return docDelta;
+}
+
+CL_NS_END
\ No newline at end of file
Added: branches/lucene2_3_2/src/core/CLucene/index/DefaultSkipListReader.h
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/index/DefaultSkipListReader.h (rev 0)
+++ branches/lucene2_3_2/src/core/CLucene/index/DefaultSkipListReader.h 2009-04-12 13:21:07 UTC (rev 2985)
@@ -0,0 +1,57 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+*
+* Distributable under the terms of either the Apache License (Version 2.0) or
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#ifndef _lucene_index_DefaultSkipListReader_
+#define _lucene_index_DefaultSkipListReader_
+
+#include "MultiLevelSkipListReader.h"
+
+CL_NS_DEF(index)
+
+/**
+ * Implements the skip list reader for the default posting list format
+ * that stores positions and payloads.
+ *
+ * On top of the doc ids tracked by MultiLevelSkipListReader, this class keeps
+ * a freq pointer, a prox pointer and a payload length for every skip level,
+ * plus the values of the last skip entry that was passed.
+ */
+class DefaultSkipListReader: public MultiLevelSkipListReader {
+private:
+ // set by init(); when true, readSkipData() decodes payload lengths
+ bool currentFieldStoresPayloads;
+ // per-level state, one slot per skip level (arrays allocated in the constructor)
+ int64_t* freqPointer;
+ int64_t* proxPointer;
+ int32_t* payloadLength;
+
+ // values copied from the per-level arrays by setLastSkipData()
+ int64_t lastFreqPointer;
+ int64_t lastProxPointer;
+ int32_t lastPayloadLength;
+
+public:
+ /** Allocates per-level state for maxSkipLevels levels; the stream and the
+ * skip interval are forwarded to MultiLevelSkipListReader. */
+ DefaultSkipListReader(CL_NS(store)::IndexInput* _skipStream, const int32_t maxSkipLevels, const int32_t _skipInterval);
+ virtual ~DefaultSkipListReader();
+
+ /** Prepares the reader for a new term: records the base freq/prox pointers,
+ * the document frequency and whether the field stores payloads. */
+ void init(const int64_t _skipPointer, const int64_t freqBasePointer, const int64_t proxBasePointer, const int32_t df, const bool storesPayloads);
+
+ /** Returns the freq pointer of the doc to which the last call of
+ * {@link MultiLevelSkipListReader#skipTo(int)} has skipped. */
+ int64_t getFreqPointer() const;
+
+ /** Returns the prox pointer of the doc to which the last call of
+ * {@link MultiLevelSkipListReader#skipTo(int)} has skipped. */
+ int64_t getProxPointer() const;
+
+ /** Returns the payload length of the payload stored just before
+ * the doc to which the last call of {@link MultiLevelSkipListReader#skipTo(int)}
+ * has skipped. */
+ int32_t getPayloadLength() const;
+
+protected:
+ /** Seeds the per-level state of the given level from the last skip entry. */
+ void seekChild(const int32_t level);
+
+ /** Copies the per-level state of the given level into the last* members. */
+ void setLastSkipData(const int32_t level);
+
+ /** Decodes one skip entry of the given level and returns its doc delta. */
+ int32_t readSkipData(const int32_t level, CL_NS(store)::IndexInput* _skipStream);
+};
+CL_NS_END
+#endif
Modified: branches/lucene2_3_2/src/core/CLucene/index/DocumentWriter.cpp
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/index/DocumentWriter.cpp 2009-04-12 12:53:18 UTC (rev 2984)
+++ branches/lucene2_3_2/src/core/CLucene/index/DocumentWriter.cpp 2009-04-12 13:21:07 UTC (rev 2985)
@@ -35,7 +35,7 @@
/*Posting*/
-DocumentWriter::Posting::Posting(Term* t, const int32_t position, TermVectorOffsetInfo* offset)
+DocumentWriter::Posting::Posting(Term* t, const int32_t position, TermVectorOffsetInfo* offset):offsets(NULL)
{
//Func - Constructor
//Pre - t contains a valid reference to a Term
@@ -43,14 +43,18 @@
freq = 1;
term = _CL_POINTER(t);
- positions.values = (int32_t*)malloc(sizeof(int32_t));
- positions.values[0] = position;
- positions.length = 1;
+ this->positions = _CLNEW ValueArray<int32_t>(1);
+ this->positions->values[0] = position;
+ //positions.values = (int32_t*)malloc(sizeof(int32_t));
+ //positions.values[0] = position;
+ //positions.length = 1;
if ( offset != NULL ){
- this->offsets.values = (TermVectorOffsetInfo*)malloc(sizeof(TermVectorOffsetInfo));
- this->offsets.values[0] = *offset;
- this->offsets.length = 1;
+ this->offsets = _CLNEW ObjectArray<TermVectorOffsetInfo>(1);
+ this->offsets->values[0] = offset;
+ //this->offsets.values = (TermVectorOffsetInfo**)malloc(sizeof(TermVectorOffsetInfo));
+ //this->offsets.values[0] = offset;
+ //this->offsets.length = 1;
}
}
DocumentWriter::Posting::~Posting(){
@@ -58,9 +62,11 @@
//Pre - true
//Post - The instance has been destroyed
- free(this->positions.values);
- if ( this->offsets.values != NULL )
- free(this->offsets.values);
+ _CLDELETE_LARRAY(this->positions->values);
+ //_CLLDELETE(this->positions);
+ //_CLLDELETE(this->offsets);
+ if ( this->offsets != NULL )
+ _CLDELETE_LARRAY(this->offsets->values);
_CLDECDELETE(this->term);
}
@@ -376,19 +382,19 @@
Posting* ti = postingTable->get(termBuffer);
if (ti != NULL) { // word seen before
int32_t freq = ti->freq;
- if (ti->positions.length == freq) {
+ if (ti->positions->length == freq) {
// positions array is full, realloc its size
- ti->positions.length = freq*2;
- ti->positions.values = (int32_t*)realloc(ti->positions.values, ti->positions.length * sizeof(int32_t));
+ ti->positions->length = freq*2;
+ ti->positions->values = (int32_t*)realloc(ti->positions->values, ti->positions->length * sizeof(int32_t));
}
- ti->positions.values[freq] = position; // add new position
+ ti->positions->values[freq] = position; // add new position
if (offset != NULL) {
- if (ti->offsets.length == freq){
- ti->offsets.length = freq*2;
- ti->offsets.values = (TermVectorOffsetInfo*)realloc(ti->offsets.values, ti->offsets.length * sizeof(TermVectorOffsetInfo));
+ if (ti->offsets->length == freq){
+ ti->offsets->length = freq*2;
+ ti->offsets->values = (TermVectorOffsetInfo**)realloc(ti->offsets->values, ti->offsets->length * sizeof(TermVectorOffsetInfo));
}
- ti->offsets[freq] = *offset;
+ ti->offsets->values[freq] = offset;
}
ti->freq = freq + 1; // update frequency
@@ -490,8 +496,8 @@
int32_t lastPosition = 0; // write positions
for (int32_t j = 0; j < postingFreq; ++j) { // use delta-encoding
- prox->writeVInt(posting->positions.values[j] - lastPosition);
- lastPosition = posting->positions.values[j];
+ prox->writeVInt(posting->positions->values[j] - lastPosition);
+ lastPosition = posting->positions->values[j];
}
// check to see if we switched to a new field
@@ -513,7 +519,7 @@
}
}
if (termVectorWriter != NULL && termVectorWriter->isFieldOpen()) {
- termVectorWriter->addTerm(posting->term->text(), postingFreq, &posting->positions, &posting->offsets);
+ termVectorWriter->addTerm(posting->term->text(), postingFreq, posting->positions, posting->offsets);
}
}
if (termVectorWriter != NULL)
Modified: branches/lucene2_3_2/src/core/CLucene/index/IndexReader.cpp
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/index/IndexReader.cpp 2009-04-12 12:53:18 UTC (rev 2984)
+++ branches/lucene2_3_2/src/core/CLucene/index/IndexReader.cpp 2009-04-12 13:21:07 UTC (rev 2985)
@@ -367,7 +367,7 @@
return _termPositions;
}
- bool IndexReader::getTermFreqVectors(int32_t docNumber, Array<TermFreqVector*>& result){
+ bool IndexReader::getTermFreqVectors(int32_t docNumber, ObjectArray<TermFreqVector>& result){
return this->getTermFreqVectors(docNumber, result);
}
Modified: branches/lucene2_3_2/src/core/CLucene/index/IndexReader.h
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/index/IndexReader.h 2009-04-12 12:53:18 UTC (rev 2984)
+++ branches/lucene2_3_2/src/core/CLucene/index/IndexReader.h 2009-04-12 13:21:07 UTC (rev 2985)
@@ -260,7 +260,7 @@
* @throws IOException if index cannot be accessed
* @see org.apache.lucene.document.Field.TermVector
*/
- virtual bool getTermFreqVectors(int32_t docNumber, CL_NS(util)::Array<TermFreqVector*>& result) =0;
+ virtual bool getTermFreqVectors(int32_t docNumber, CL_NS(util)::ObjectArray<TermFreqVector>& result) =0;
/**
* Return a term frequency vector for the specified document and field. The
Added: branches/lucene2_3_2/src/core/CLucene/index/MultiLevelSkipListReader.cpp
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/index/MultiLevelSkipListReader.cpp (rev 0)
+++ branches/lucene2_3_2/src/core/CLucene/index/MultiLevelSkipListReader.cpp 2009-04-12 13:21:07 UTC (rev 2985)
@@ -0,0 +1,227 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+*
+* Distributable under the terms of either the Apache License (Version 2.0) or
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#include "CLucene/_ApiHeader.h"
+#include "MultiLevelSkipListReader.h"
+
+CL_NS_USE(store)
+CL_NS_DEF(index)
+
+/**
+ * Creates a multi-level skip list reader.
+ *
+ * Allocates the per-level arrays, stores _skipStream as the stream of level 0
+ * and precomputes the skip interval of every level (each level's interval is
+ * the previous level's interval times _skipInterval).
+ *
+ * Every member is given a defined value here: init() and close() inspect
+ * numberOfSkipLevels and the skipStream slots before loadSkipLevels() has had
+ * a chance to set them.
+ *
+ * @param _skipStream   stream used for level 0; must not be NULL
+ * @param maxSkipLevels number of levels to allocate state for
+ * @param _skipInterval skip interval of level 0
+ */
+MultiLevelSkipListReader::MultiLevelSkipListReader(IndexInput* _skipStream, const int32_t maxSkipLevels,
+        const int32_t _skipInterval):
+    maxNumberOfSkipLevels(maxSkipLevels),
+    numberOfSkipLevels(0),
+    numberOfLevelsToBuffer(1),
+    docCount(0),
+    haveSkipped(false),
+    skipStream(_CL_NEWARRAY(IndexInput*,maxSkipLevels)),
+    skipPointer(_CL_NEWARRAY(int64_t,maxSkipLevels)),
+    skipInterval(_CL_NEWARRAY(int32_t,maxSkipLevels)),
+    numSkipped(_CL_NEWARRAY(int32_t,maxSkipLevels)),
+    skipDoc(_CL_NEWARRAY(int32_t,maxSkipLevels)),
+    lastDoc(0),
+    childPointer(_CL_NEWARRAY(int64_t,maxSkipLevels)),
+    lastChildPointer(0),
+    inputIsBuffered(strcmp(_skipStream->getObjectName(),"BufferedIndexInput") == 0)
+{
+    // Start every level from a known state; in particular the stream slots
+    // must be NULL so that close()/init() never delete a garbage pointer.
+    for (int32_t level = 0; level < maxSkipLevels; level++) {
+        skipStream[level] = NULL;
+        skipPointer[level] = 0;
+        childPointer[level] = 0;
+        numSkipped[level] = 0;
+        skipDoc[level] = 0;
+    }
+
+    this->skipStream[0] = _skipStream;
+    this->skipInterval[0] = _skipInterval;
+    for (int32_t i = 1; i < maxSkipLevels; i++) {
+        // cache skip intervals
+        this->skipInterval[i] = this->skipInterval[i - 1] * _skipInterval;
+    }
+}
+/**
+ * Releases the streams of the upper levels and then the per-level arrays.
+ *
+ * NOTE(review): skipStream[0], the stream handed to the constructor, is not
+ * deleted here or in close() - confirm who owns it.
+ */
+MultiLevelSkipListReader::~MultiLevelSkipListReader(){
+    // close() walks the skipStream array, so it has to run before the array goes away
+    close();
+
+    _CLDELETE_LARRAY(skipDoc);
+    _CLDELETE_LARRAY(skipInterval);
+    _CLDELETE_LARRAY(numSkipped);
+    _CLDELETE_LARRAY(childPointer);
+    _CLDELETE_LARRAY(skipPointer);
+    _CLDELETE_LARRAY(skipStream);
+}
+
+/** Doc id recorded by the most recent setLastSkipData(). */
+int32_t MultiLevelSkipListReader::getDoc() const {
+    return this->lastDoc;
+}
+
+/**
+ * Advances the skip lists towards target.
+ *
+ * Loads the skip levels on first use, climbs to the highest level whose
+ * current entry is still below target, and then works its way back down to
+ * level 0, reading entries on each level until that level's current entry is
+ * no longer below target.
+ *
+ * @param target doc id to skip towards
+ * @return numSkipped[0] - skipInterval[0] - 1
+ */
+int32_t MultiLevelSkipListReader::skipTo(const int32_t target) {
+    if (!haveSkipped) {
+        // first call for this term: read the level layout
+        loadSkipLevels();
+        haveSkipped = true;
+    }
+
+    // climb while the next level up still has its current entry below target
+    int32_t level = 0;
+    while (level < numberOfSkipLevels - 1) {
+        if (target <= skipDoc[level + 1]) {
+            break;
+        }
+        ++level;
+    }
+
+    while (level >= 0) {
+        if (target <= skipDoc[level]) {
+            // nothing more to skip on this level: descend, repositioning the
+            // child level first if its stream is behind the recorded child pointer
+            if (level > 0 && lastChildPointer > skipStream[level - 1]->getFilePointer()) {
+                seekChild(level - 1);
+            }
+            --level;
+            continue;
+        }
+
+        // Read the next entry of this level. Whether or not one was available,
+        // the loop re-checks skipDoc[level] on the next iteration.
+        loadNextSkip(level);
+    }
+
+    return numSkipped[0] - skipInterval[0] - 1;
+}
+
+/**
+ * Moves the given level on to its next skip entry.
+ *
+ * The current entry is first recorded through setLastSkipData(). If the level
+ * has no further entries, its skipDoc is set to LUCENE_INT32_MAX_SHOULDBE and
+ * numberOfSkipLevels is lowered to level when it was larger.
+ *
+ * @param level level to advance
+ * @return true if a new entry was read, false if the level is exhausted
+ */
+bool MultiLevelSkipListReader::loadNextSkip(const int32_t level) {
+    // target lies beyond the current entry, so remember it before moving on
+    setLastSkipData(level);
+
+    numSkipped[level] += skipInterval[level];
+
+    const bool exhausted = numSkipped[level] > docCount;
+    if (exhausted) {
+        skipDoc[level] = LUCENE_INT32_MAX_SHOULDBE;
+        if (level < numberOfSkipLevels) {
+            numberOfSkipLevels = level;
+        }
+        return false;
+    }
+
+    // next entry: format-specific data first ...
+    skipDoc[level] += readSkipData(level, skipStream[level]);
+
+    if (level == 0) {
+        // the leaf level stores no child pointer
+        return true;
+    }
+
+    // ... then the pointer into the level below
+    childPointer[level] = skipStream[level]->readVLong() + skipPointer[level - 1];
+    return true;
+}
+
+/**
+ * Positions the given level on the child entry of the last skip entry read on
+ * the level above it.
+ *
+ * @param level the level to reposition (one below the level being left)
+ */
+void MultiLevelSkipListReader::seekChild(const int32_t level) {
+    IndexInput* const stream = skipStream[level];
+
+    stream->seek(lastChildPointer);
+    skipDoc[level] = lastDoc;
+    numSkipped[level] = numSkipped[level + 1] - skipInterval[level + 1];
+
+    if (level <= 0) {
+        // the leaf level has no child pointer to read
+        return;
+    }
+    childPointer[level] = stream->readVLong() + skipPointer[level - 1];
+}
+
+/**
+ * Deletes the streams of all levels above level 0.
+ *
+ * Each slot is reset to NULL after its stream is deleted so that calling
+ * close() again, or calling init() afterwards, cannot delete the same stream
+ * twice.
+ *
+ * NOTE(review): skipStream[0] is intentionally left alone here, as before;
+ * confirm the owner of the stream passed to the constructor.
+ */
+void MultiLevelSkipListReader::close() {
+    for (int32_t i = 1; i < maxNumberOfSkipLevels; i++) {
+        if (skipStream[i] != NULL) {
+            //skipStream[i]->close();
+            _CLLDELETE(skipStream[i]);
+            skipStream[i] = NULL;
+        }
+    }
+}
+
+/**
+ * Initializes the reader for the skip data of a new term.
+ *
+ * @param _skipPointer start of the term's skip data
+ * @param df           document frequency of the term
+ */
+void MultiLevelSkipListReader::init(const int64_t _skipPointer, const int32_t df) {
+    this->skipPointer[0] = _skipPointer;
+    this->docCount = df;
+
+    // Reset every allocated level, not just the first numberOfSkipLevels:
+    // that count is only assigned by loadSkipLevels() and is lowered by
+    // loadNextSkip(), so it does not cover all levels a later term may use.
+    for (int32_t j = 0; j < maxNumberOfSkipLevels; j++) {
+        skipDoc[j] = 0;
+        numSkipped[j] = 0;
+        childPointer[j] = 0;
+    }
+
+    haveSkipped = false;
+
+    // Drop the streams created for the previous term on every upper level, so
+    // loadSkipLevels() never overwrites (and leaks) a live stream. This relies
+    // on the same "slot is NULL or a live stream" invariant that close() uses.
+    for (int32_t i = 1; i < maxNumberOfSkipLevels; i++) {
+        if (skipStream[i] != NULL) {
+            _CLDELETE(skipStream[i]);
+            skipStream[i] = NULL;
+        }
+    }
+}
+
+/**
+ * Reads the layout of the skip levels for the current term.
+ *
+ * Computes numberOfSkipLevels from docCount and the level-0 skip interval
+ * (capped at maxNumberOfSkipLevels), then walks the upper levels from the
+ * highest down to level 1, recording each level's start pointer and giving it
+ * its own stream. Level 0 keeps using skipStream[0].
+ */
+void MultiLevelSkipListReader::loadSkipLevels() {
+ // floor(log(docCount) / log(skipInterval[0])), or 0 when there are no docs
+ numberOfSkipLevels = (docCount == 0) ? 0 : (int32_t)floor(log((double)docCount) / log((double)skipInterval[0]));
+ if (numberOfSkipLevels > maxNumberOfSkipLevels) {
+ numberOfSkipLevels = maxNumberOfSkipLevels;
+ }
+
+ skipStream[0]->seek(skipPointer[0]);
+
+ // how many of the top levels are still to be copied into memory
+ int32_t toBuffer = numberOfLevelsToBuffer;
+
+ for (int32_t i = numberOfSkipLevels - 1; i > 0; i--) {
+ // the length of the current level
+ int64_t length = skipStream[0]->readVLong();
+
+ // the start pointer of the current level
+ skipPointer[i] = skipStream[0]->getFilePointer();
+ if (toBuffer > 0) {
+ // buffer this level
+ // (the SkipBuffer constructor reads 'length' bytes from skipStream[0],
+ // which also moves the base stream past this level)
+ skipStream[i] = static_cast<IndexInput*>(_CLNEW SkipBuffer(skipStream[0], (int32_t) length));
+ toBuffer--;
+ } else {
+ // clone this stream, it is already at the start of the current level
+ skipStream[i] = (IndexInput*) skipStream[0]->clone();
+ // NOTE(review): the cast below trusts that a clone of a stream named
+ // "BufferedIndexInput" is itself a BufferedIndexInput - confirm.
+ if (inputIsBuffered && length < BufferedIndexInput::BUFFER_SIZE) {
+ ((BufferedIndexInput*) skipStream[i])->setBufferSize((int32_t) length);
+ }
+
+ // move base stream beyond the current level
+ skipStream[0]->seek(skipStream[0]->getFilePointer() + length);
+ }
+ }
+
+ // use base stream for the lowest level
+ skipPointer[0] = skipStream[0]->getFilePointer();
+}
+
+/**
+ * Records the current entry of the given level (doc id and child pointer) as
+ * the last skip entry read.
+ */
+void MultiLevelSkipListReader::setLastSkipData(const int32_t level) {
+    lastChildPointer = childPointer[level];
+    lastDoc = skipDoc[level];
+}
+
+/**
+ * Copies _length bytes from input, starting at its current file pointer, into
+ * an in-memory buffer. The file pointer the copy started at is remembered so
+ * that positions can be reported in the coordinates of the original stream.
+ *
+ * @param input   stream to copy from; it is advanced by the read
+ * @param _length number of bytes to buffer
+ */
+MultiLevelSkipListReader::SkipBuffer::SkipBuffer(IndexInput* input, const int32_t _length)
+    : data(_CL_NEWARRAY(uint8_t,_length)),
+      pointer(input->getFilePointer()),
+      pos(0),
+      _datalength(_length)
+{
+    input->readBytes(data, _length);
+}
+/** Releases the buffered bytes. */
+MultiLevelSkipListReader::SkipBuffer::~SkipBuffer()
+{
+    // data is an array obtained from _CL_NEWARRAY, so it is released with the
+    // array macro, as the enclosing reader does for its _CL_NEWARRAY buffers.
+    _CLDELETE_LARRAY(data);
+}
+
+/**
+ * Drops the buffered bytes. The buffer pointer is reset to NULL so that the
+ * destructor, or a second close(), does not release it again.
+ */
+void MultiLevelSkipListReader::SkipBuffer::close() {
+    // data is an array obtained from _CL_NEWARRAY: release it with the array macro
+    _CLDELETE_LARRAY(data);
+    data = NULL;
+    _datalength=0;
+}
+
+/** Current position, expressed as a file pointer of the stream the data was copied from. */
+int64_t MultiLevelSkipListReader::SkipBuffer::getFilePointer() const {
+    const int64_t absolutePosition = pointer + pos;
+    return absolutePosition;
+}
+
+/** Number of bytes held by the buffer. */
+int64_t MultiLevelSkipListReader::SkipBuffer::length() const {
+    return this->_datalength;
+}
+
+/** Returns the byte at the current position and advances by one. No bounds check is performed. */
+uint8_t MultiLevelSkipListReader::SkipBuffer::readByte() {
+    const uint8_t value = data[pos];
+    ++pos;
+    return value;
+}
+
+/**
+ * Copies len bytes from the current position into b and advances by len.
+ * No bounds check is performed.
+ */
+void MultiLevelSkipListReader::SkipBuffer::readBytes(uint8_t* b, const int32_t len) {
+    const uint8_t* const from = data + pos;
+    memcpy(b, from, len);
+    pos = pos + len;
+}
+
+/** Moves to _pos, given as a file pointer of the stream the data was copied from. */
+void MultiLevelSkipListReader::SkipBuffer::seek(const int64_t _pos) {
+    const int64_t offsetInBuffer = _pos - pointer;
+    pos = (int32_t) offsetInBuffer;
+}
+
+/** Name identifying this IndexInput implementation. */
+const char* MultiLevelSkipListReader::SkipBuffer::getObjectName()
+{
+    return "SkipBuffer";
+}
+/** Directory type string reported for this stream. */
+const char* MultiLevelSkipListReader::SkipBuffer::getDirectoryType() const
+{
+    return "SKIP";
+}
+/** Copy constructor: duplicates the buffered bytes and the read position of other. */
+MultiLevelSkipListReader::SkipBuffer::SkipBuffer(const SkipBuffer& other)
+    : IndexInput(other),
+      data(_CL_NEWARRAY(uint8_t,other._datalength)),
+      pointer(other.pointer),
+      pos(other.pos),
+      _datalength(other._datalength)
+{
+    memcpy(data,other.data,other._datalength * sizeof(uint8_t));
+}
+/** Returns a newly allocated copy of this buffer, made with the copy constructor. */
+IndexInput* MultiLevelSkipListReader::SkipBuffer::clone() const{
+    SkipBuffer* const copy = _CLNEW SkipBuffer(*this);
+    return copy;
+}
+
+CL_NS_END
\ No newline at end of file
Added: branches/lucene2_3_2/src/core/CLucene/index/MultiLevelSkipListReader.h
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/index/MultiLevelSkipListReader.h (rev 0)
+++ branches/lucene2_3_2/src/core/CLucene/index/MultiLevelSkipListReader.h 2009-04-12 13:21:07 UTC (rev 2985)
@@ -0,0 +1,132 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+*
+* Distributable under the terms of either the Apache License (Version 2.0) or
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#ifndef _lucene_index_MultiLevelSkipListReader_
+#define _lucene_index_MultiLevelSkipListReader_
+
+#include "CLucene/store/IndexInput.h"
+//#include "CLucene/util/Array.h"
+
+CL_NS_DEF(index)
+
+/**
+ * This abstract class reads skip lists with multiple levels.
+ *
+ * See {@link MultiLevelSkipListWriter} for the information about the encoding
+ * of the multi level skip lists.
+ *
+ * Subclasses must implement the abstract method {@link #readSkipData(int, IndexInput)}
+ * which defines the actual format of the skip data.
+ *
+ * Typical use, as far as this class shows: call init() for a term, then
+ * skipTo(); the skip levels are loaded lazily on the first skipTo().
+ */
+class MultiLevelSkipListReader : LUCENE_BASE {
+private:
+ // the maximum number of skip levels possible for this index
+ // (also the length of every per-level array below)
+ int32_t maxNumberOfSkipLevels;
+
+protected:
+ // number of levels in this skip list
+ // (assigned by loadSkipLevels(), lowered by loadNextSkip() when a level runs out)
+ int32_t numberOfSkipLevels;
+
+private:
+ // Expert: defines the number of top skip levels to buffer in memory.
+ // Reducing this number results in less memory usage, but possibly
+ // slower performance due to more random I/Os.
+ // Please notice that the space each level occupies is limited by
+ // the skipInterval. The top level can not contain more than
+ // skipLevel entries, the second top level can not contain more
+ // than skipLevel^2 entries and so forth.
+ int32_t numberOfLevelsToBuffer;
+
+ // document frequency passed to init()
+ int32_t docCount;
+ // false until the first skipTo() after init() has loaded the skip levels
+ bool haveSkipped;
+
+ CL_NS(store)::IndexInput** skipStream; // skipStream for each level
+ int64_t* skipPointer; // the start pointer of each skip level
+ int32_t* skipInterval; // skipInterval of each level
+ int32_t* numSkipped; // number of docs skipped per level
+
+ int32_t* skipDoc; // doc id of current skip entry per level
+ int32_t lastDoc; // doc id of last read skip entry with docId <= target
+ int64_t* childPointer; // child pointer of current skip entry per level
+ int64_t lastChildPointer; // childPointer of last read skip entry with docId <= target
+
+ // true when the stream given to the constructor reports the object name "BufferedIndexInput"
+ bool inputIsBuffered;
+
+public:
+ /** Allocates state for maxSkipLevels levels; _skipStream becomes the stream of level 0. */
+ MultiLevelSkipListReader(CL_NS(store)::IndexInput* _skipStream, const int32_t maxSkipLevels, const int32_t _skipInterval);
+ virtual ~MultiLevelSkipListReader();
+
+ /** Returns the id of the doc to which the last call of {@link #skipTo(int)}
+ * has skipped. */
+ int32_t getDoc() const;
+
+ /** Skips entries to the first beyond the current whose document number is
+ * greater than or equal to <i>target</i>. Returns the current doc count.
+ */
+ int32_t skipTo(const int32_t target);
+
+private:
+ /** Advances the given level to its next skip entry; returns false when the level is exhausted. */
+ bool loadNextSkip(const int32_t level);
+
+protected:
+ /** Seeks the skip entry on the given level */
+ virtual void seekChild(const int32_t level);
+
+ /** Deletes the streams of all levels above level 0. */
+ void close();
+
+ /** initializes the reader */
+ void init(const int64_t _skipPointer, const int32_t df);
+
+private:
+ /** Loads the skip levels */
+ void loadSkipLevels();
+
+protected:
+ /**
+ * Subclasses must implement the actual skip data encoding in this method.
+ *
+ * @param level the level skip data shall be read from
+ * @param skipStream the skip stream to read from
+ */
+ virtual int32_t readSkipData(const int32_t level, CL_NS(store)::IndexInput* skipStream) = 0;
+
+ /** Copies the values of the last read skip entry on this level */
+ virtual void setLastSkipData(const int32_t level);
+
+protected:
+ /** used to buffer the top skip levels */
+ class SkipBuffer : public CL_NS(store)::IndexInput {
+ private:
+ uint8_t* data; // bytes copied from the input by the constructor
+ int64_t pointer; // file pointer of the input at the time the bytes were copied
+ int32_t pos; // current read offset into data
+ size_t _datalength; // number of bytes held in data
+
+ public:
+ /** Copies 'length' bytes from input, starting at its current file pointer. */
+ SkipBuffer(CL_NS(store)::IndexInput* input, const int32_t length);
+ virtual ~SkipBuffer();
+
+ private:
+ // NOTE(review): the members below are declared private; presumably they
+ // are meant to be reached only through the IndexInput interface - confirm.
+ void close();
+
+ int64_t getFilePointer() const;
+
+ int64_t length() const;
+
+ uint8_t readByte();
+
+ void readBytes(uint8_t* b, const int32_t len);
+
+ void seek(const int64_t _pos);
+
+ SkipBuffer(const SkipBuffer& other);
+ CL_NS(store)::IndexInput* clone() const;
+ const char* getObjectName();
+ const char* getDirectoryType() const;
+ };
+};
+CL_NS_END
+#endif
Modified: branches/lucene2_3_2/src/core/CLucene/index/MultiReader.cpp
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/index/MultiReader.cpp 2009-04-12 12:53:18 UTC (rev 2984)
+++ branches/lucene2_3_2/src/core/CLucene/index/MultiReader.cpp 2009-04-12 13:21:07 UTC (rev 2985)
@@ -103,7 +103,7 @@
_CLDELETE(internal);
}
-bool MultiReader::getTermFreqVectors(int32_t n, Array<TermFreqVector*>& result){
+bool MultiReader::getTermFreqVectors(int32_t n, ObjectArray<TermFreqVector>& result){
int32_t i = readerIndex(n); // find segment num
return subReaders[i]->getTermFreqVectors(n - starts[i], result); // dispatch to segment
}
Modified: branches/lucene2_3_2/src/core/CLucene/index/MultiReader.h
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/index/MultiReader.h 2009-04-12 12:53:18 UTC (rev 2984)
+++ branches/lucene2_3_2/src/core/CLucene/index/MultiReader.h 2009-04-12 13:21:07 UTC (rev 2985)
@@ -58,7 +58,7 @@
* in a given vectorized field.
* If no such fields existed, the method returns null.
*/
- bool getTermFreqVectors(int32_t n, CL_NS(util)::Array<TermFreqVector*>& result);
+ bool getTermFreqVectors(int32_t n, CL_NS(util)::ObjectArray<TermFreqVector>& result);
TermFreqVector* getTermFreqVector(int32_t n, const TCHAR* field);
Modified: branches/lucene2_3_2/src/core/CLucene/index/SegmentMerger.cpp
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/index/SegmentMerger.cpp 2009-04-12 12:53:18 UTC (rev 2984)
+++ branches/lucene2_3_2/src/core/CLucene/index/SegmentMerger.cpp 2009-04-12 13:21:07 UTC (rev 2985)
@@ -307,9 +307,9 @@
if (reader->isDeleted(docNum))
continue;
- ObjectArray<TermFreqVector*> tmp;
- if ( reader->getTermFreqVectors(docNum, (Array<TermFreqVector*>&)tmp) )
- termVectorsWriter->addAllDocVectors((Array<TermFreqVector*>&)tmp);
+ ObjectArray<TermFreqVector> tmp;
+ if ( reader->getTermFreqVectors(docNum, (ObjectArray<TermFreqVector>&)tmp) )
+ termVectorsWriter->addAllDocVectors((ObjectArray<TermFreqVector>&)tmp);
tmp.deleteValues();
}
}
Modified: branches/lucene2_3_2/src/core/CLucene/index/SegmentReader.cpp
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/index/SegmentReader.cpp 2009-04-12 12:53:18 UTC (rev 2984)
+++ branches/lucene2_3_2/src/core/CLucene/index/SegmentReader.cpp 2009-04-12 13:21:07 UTC (rev 2985)
@@ -817,7 +817,7 @@
return termVectorsReader->get(docNumber, field);
}
- bool SegmentReader::getTermFreqVectors(int32_t docNumber, Array<TermFreqVector*>& result) {
+ bool SegmentReader::getTermFreqVectors(int32_t docNumber, ObjectArray<TermFreqVector>& result) {
if (termVectorsReaderOrig == NULL)
return false;
@@ -825,7 +825,8 @@
if (termVectorsReader == NULL)
return false;
- return termVectorsReader->get(docNumber, result);
+ result = (*termVectorsReader->get(docNumber));
+ return true;
}
CL_NS_END
Modified: branches/lucene2_3_2/src/core/CLucene/index/SegmentTermDocs.cpp
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/index/SegmentTermDocs.cpp 2009-04-12 12:53:18 UTC (rev 2984)
+++ branches/lucene2_3_2/src/core/CLucene/index/SegmentTermDocs.cpp 2009-04-12 13:21:07 UTC (rev 2985)
@@ -13,40 +13,15 @@
CL_NS_DEF(index)
- SegmentTermDocs::SegmentTermDocs(const SegmentReader* _parent){
- //Func - Constructor
- //Pre - Paren != NULL
- //Post - The instance has been created
-
- CND_PRECONDITION(_parent != NULL,"Parent is NULL");
-
- parent = _parent;
- deletedDocs = parent->deletedDocs;
-
- _doc = 0;
- _freq = 0;
- count = 0;
- df = 0;
-
- skipInterval=0;
- numSkips=0;
- skipCount=0;
- skipStream=NULL;
- skipDoc=0;
- freqPointer=0;
- proxPointer=0;
- skipPointer=0;
- haveSkipped=false;
-
- freqStream = parent->freqStream->clone();
- skipInterval = parent->tis->getSkipInterval();
+ SegmentTermDocs::SegmentTermDocs(const SegmentReader* _parent) : parent(_parent),freqStream(_parent->freqStream->clone()),
+ count(0),df(0),deletedDocs(_parent->deletedDocs),_doc(0),_freq(0),skipInterval(_parent->tis->getSkipInterval()),
+ maxSkipLevels(_parent->tis->getMaxSkipLevels()),skipListReader(NULL),freqBasePointer(0),proxBasePointer(0),
+ skipPointer(0),haveSkipped(false)
+ {
+ CND_CONDITION(_parent != NULL,"Parent is NULL");
}
SegmentTermDocs::~SegmentTermDocs() {
- //Func - Destructor
- //Pre - true
- //Post - The instance has been destroyed
-
close();
}
@@ -56,52 +31,47 @@
void SegmentTermDocs::seek(Term* term) {
TermInfo* ti = parent->tis->get(term);
- seek(ti);
+ seek(ti, term);
_CLDELETE(ti);
}
void SegmentTermDocs::seek(TermEnum* termEnum){
TermInfo* ti=NULL;
+ Term* term = NULL;
// use comparison of fieldinfos to verify that termEnum belongs to the same segment as this SegmentTermDocs
if ( termEnum->getObjectName() == SegmentTermEnum::getClassName() && ((SegmentTermEnum*)termEnum)->fieldInfos == parent->fieldInfos ){
- ti = ((SegmentTermEnum*)termEnum)->getTermInfo();
- }else{
- ti = parent->tis->get(termEnum->term(false));
+ SegmentTermEnum* segmentTermEnum = ((SegmentTermEnum*) termEnum);
+ term = segmentTermEnum->term(false);
+ ti = segmentTermEnum->getTermInfo();
+ }else{
+ term = termEnum->term(false);
+ ti = parent->tis->get(term);
}
- seek(ti);
+ seek(ti,term);
_CLDELETE(ti);
}
- void SegmentTermDocs::seek(const TermInfo* ti) {
- count = 0;
- if (ti == NULL) {
- df = 0;
- } else {
- df = ti->docFreq;
- _doc = 0;
- skipDoc = 0;
- skipCount = 0;
- numSkips = df / skipInterval;
- freqPointer = ti->freqPointer;
- proxPointer = ti->proxPointer;
- skipPointer = freqPointer + ti->skipOffset;
- freqStream->seek(freqPointer);
- haveSkipped = false;
- }
+ void SegmentTermDocs::seek(const TermInfo* ti,Term* term) {
+ count = 0;
+ FieldInfo* fi = parent->fieldInfos->fieldInfo(term->field());
+ currentFieldStoresPayloads = (fi != NULL) ? fi->storePayloads : false;
+ if (ti == NULL) {
+ df = 0;
+ } else { // punt case
+ df = ti->docFreq;
+ _doc = 0;
+ freqBasePointer = ti->freqPointer;
+ proxBasePointer = ti->proxPointer;
+ skipPointer = freqBasePointer + ti->skipOffset;
+ freqStream->seek(freqBasePointer);
+ haveSkipped = false;
+ }
}
void SegmentTermDocs::close() {
-
- //Check if freqStream still exists
- if (freqStream != NULL){
- freqStream->close(); //todo: items like these can probably be delete, because deleting the object also closes it...do everywhere
- _CLDELETE( freqStream );
- }
- if (skipStream != NULL){
- skipStream->close();
- _CLDELETE( skipStream );
- }
+ _CLDELETE( freqStream );
+ _CLDELETE( skipListReader );
}
int32_t SegmentTermDocs::doc()const {
@@ -132,76 +102,51 @@
}
int32_t SegmentTermDocs::read(int32_t* docs, int32_t* freqs, int32_t length) {
- int32_t i = 0;
-//todo: one optimization would be to get the pointer buffer for ram or mmap dirs
-//and iterate over them instead of using readByte() intensive functions.
- while (i<length && count < df) {
- uint32_t docCode = freqStream->readVInt();
- _doc += docCode >> 1;
- if ((docCode & 1) != 0) // if low bit is set
- _freq = 1; // _freq is one
- else
- _freq = freqStream->readVInt(); // else read _freq
- count++;
+ int32_t i = 0;
+ //todo: one optimization would be to get the pointer buffer for ram or mmap dirs
+ //and iterate over them instead of using readByte() intensive functions.
+ while (i<length && count < df) {
+ // manually inlined call to next() for speed
+ uint32_t docCode = freqStream->readVInt();
+ _doc += docCode >> 1;
+ if ((docCode & 1) != 0) // if low bit is set
+ _freq = 1; // _freq is one
+ else
+ _freq = freqStream->readVInt(); // else read _freq
+ count++;
- if (deletedDocs == NULL || (_doc >= 0 && !deletedDocs->get(_doc))) {
- docs[i] = _doc;
- freqs[i] = _freq;
- i++;
- }
- }
- return i;
+ if (deletedDocs == NULL || (_doc >= 0 && !deletedDocs->get(_doc))) {
+ docs[i] = _doc;
+ freqs[i] = _freq;
+ i++;
+ }
+ }
+ return i;
}
bool SegmentTermDocs::skipTo(const int32_t target){
assert(count <= df );
if (df >= skipInterval) { // optimized case
- if (skipStream == NULL)
- skipStream = freqStream->clone(); // lazily clone
+ if (skipListReader == NULL)
+ skipListReader = _CLNEW DefaultSkipListReader(freqStream->clone(), maxSkipLevels, skipInterval); // lazily clone
- if (!haveSkipped) { // lazily seek skip stream
- skipStream->seek(skipPointer);
- haveSkipped = true;
- }
+ if (!haveSkipped) { // lazily initialize skip stream
+ skipListReader->init(skipPointer, freqBasePointer, proxBasePointer, df, currentFieldStoresPayloads);
+ haveSkipped = true;
+ }
- // scan skip data
- int32_t lastSkipDoc = skipDoc;
- int64_t lastFreqPointer = freqStream->getFilePointer();
- int64_t lastProxPointer = -1;
- int32_t numSkipped = -1 - (count % skipInterval);
+ int32_t newCount = skipListReader->skipTo(target);
+ if (newCount > count) {
+ freqStream->seek(skipListReader->getFreqPointer());
+ skipProx(skipListReader->getProxPointer(), skipListReader->getPayloadLength());
+
+ _doc = skipListReader->getDoc();
+ count = newCount;
+ }
+ }
- while (target > skipDoc) {
- lastSkipDoc = skipDoc;
- lastFreqPointer = freqPointer;
- lastProxPointer = proxPointer;
-
- if (skipDoc != 0 && skipDoc >= _doc)
- numSkipped += skipInterval;
-
- if(skipCount >= numSkips)
- break;
-
- skipDoc += skipStream->readVInt();
- freqPointer += skipStream->readVInt();
- proxPointer += skipStream->readVInt();
-
- skipCount++;
- }
-
- // if we found something to skip, then skip it
- if (lastFreqPointer > freqStream->getFilePointer()) {
- freqStream->seek(lastFreqPointer);
- skipProx(lastProxPointer);
-
- _doc = lastSkipDoc;
- count += numSkipped;
- }
-
- }
-
// done skipping, now just scan
-
do {
if (!next())
return false;
Modified: branches/lucene2_3_2/src/core/CLucene/index/SegmentTermPositions.cpp
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/index/SegmentTermPositions.cpp 2009-04-12 12:53:18 UTC (rev 2984)
+++ branches/lucene2_3_2/src/core/CLucene/index/SegmentTermPositions.cpp 2009-04-12 13:21:07 UTC (rev 2985)
@@ -13,25 +13,13 @@
CL_NS_DEF(index)
SegmentTermPositions::SegmentTermPositions(const SegmentReader* _parent):
- SegmentTermDocs(_parent){
-//Func - Constructor
-//Pre - Parent != NULL
-//Post - The instance has been created
-
- CND_PRECONDITION(_parent != NULL, "Parent is NULL");
-
- proxStream = _parent->proxStream->clone();
-
- CND_CONDITION(proxStream != NULL,"proxStream is NULL");
-
- position = 0;
- proxCount = 0;
+ SegmentTermDocs(_parent), proxStream(NULL)// the proxStream will be cloned lazily when nextPosition() is called for the first time
+ ,lazySkipPointer(-1), lazySkipProxCount(0)
+{
+ CND_CONDITION(_parent != NULL, "Parent is NULL");
}
SegmentTermPositions::~SegmentTermPositions() {
-//Func - Destructor
-//Pre - true
-//Post - The intance has been closed
close();
}
@@ -42,43 +30,61 @@
return (TermPositions*) this;
}
-void SegmentTermPositions::seek(const TermInfo* ti) {
- SegmentTermDocs::seek(ti);
+void SegmentTermPositions::seek(const TermInfo* ti, Term* term) {
+ SegmentTermDocs::seek(ti, term);
if (ti != NULL)
- //lazySkipPointer = ti->proxPointer;
- proxStream->seek(ti->proxPointer);
+ lazySkipPointer = ti->proxPointer;
- //lazySkipDocCount = 0;
+ lazySkipProxCount = 0;
proxCount = 0;
+ payloadLength = 0;
+ needToLoadPayload = false;
}
void SegmentTermPositions::close() {
-//Func - Frees the resources
-//Pre - true
-//Post - The resources have been freed
-
SegmentTermDocs::close();
//Check if proxStream still exists
if(proxStream){
- proxStream->close();
+ proxStream->close();
_CLDELETE( proxStream );
}
}
int32_t SegmentTermPositions::nextPosition() {
- /* DSR:CL_BUG: Should raise exception if proxCount == 0 at the
+ /* todo: DSR:CL_BUG: Should raise exception if proxCount == 0 at the
** beginning of this method, as in
** if (--proxCount == 0) throw ...;
** The JavaDocs for TermPositions.nextPosition declare this constraint,
** but CLucene doesn't enforce it. */
- //lazySkip();
+ lazySkip();
proxCount--;
- return position += proxStream->readVInt();
+ return position += readDeltaPosition();
}
+int32_t SegmentTermPositions::readDeltaPosition() {
+ int32_t delta = proxStream->readVInt();
+ if (currentFieldStoresPayloads) {
+ // if the current field stores payloads then
+ // the position delta is shifted one bit to the left.
+ // if the LSB is set, then we have to read the current
+ // payload length
+ if ((delta & 1) != 0) {
+ payloadLength = proxStream->readVInt();
+ }
+ delta = (int32_t)((uint32_t)delta >> (uint32_t)1);
+ needToLoadPayload = true;
+ }
+ return delta;
+}
+
+void SegmentTermPositions::skippingDoc() {
+ lazySkipProxCount += _freq;
+}
+
bool SegmentTermPositions::next() {
- for (int32_t f = proxCount; f > 0; f--) // skip unread positions
- proxStream->readVInt();
+ // we remember to skip the remaining positions of the current
+ // document lazily
+ lazySkipProxCount += proxCount;
if (SegmentTermDocs::next()) { // run super
proxCount = _freq; // note frequency
@@ -89,35 +95,78 @@
}
int32_t SegmentTermPositions::read(int32_t* docs, int32_t* freqs, int32_t length) {
- _CLTHROWA(CL_ERR_InvalidState,"TermPositions does not support processing multiple documents in one call. Use TermDocs instead.");
+ _CLTHROWA(CL_ERR_UnsupportedOperation,"TermPositions does not support processing multiple documents in one call. Use TermDocs instead.");
}
-void SegmentTermPositions::skippingDoc() {
- for (int32_t f = _freq; f > 0; f--) // skip all positions
- proxStream->readVInt();
-// lazySkipDocCount += _freq;
+void SegmentTermPositions::skipProx(const int64_t proxPointer, const int32_t _payloadLength){
+ // we save the pointer, we might have to skip there lazily
+ lazySkipPointer = proxPointer;
+ lazySkipProxCount = 0;
+ proxCount = 0;
+ this->payloadLength = _payloadLength;
+ needToLoadPayload = false;
}
-void SegmentTermPositions::skipProx(int64_t proxPointer){
- proxStream->seek(proxPointer);
-// lazySkipPointer = proxPointer;
-// lazySkipDocCount = 0;
- proxCount = 0;
+void SegmentTermPositions::skipPositions(int32_t n) {
+ for ( int32_t f = n; f > 0; f-- ) { // skip unread positions
+ readDeltaPosition();
+ skipPayload();
+ }
}
-void SegmentTermPositions::skipPositions(int32_t n) {
- for ( int32_t f = n; f > 0; f-- )
- proxStream->readVInt();
+void SegmentTermPositions::skipPayload() {
+ if (needToLoadPayload && payloadLength > 0) {
+ proxStream->seek(proxStream->getFilePointer() + payloadLength);
+ }
+ needToLoadPayload = false;
}
void SegmentTermPositions::lazySkip() {
- if ( lazySkipPointer != 0 ) {
- proxStream->seek( lazySkipPointer );
- lazySkipPointer = 0;
+ if (proxStream == NULL) {
+ // clone lazily
+ proxStream = parent->proxStream->clone();
+ }
+
+ // we might have to skip the current payload
+ // if it was not read yet
+ skipPayload();
+
+ if (lazySkipPointer != -1) {
+ proxStream->seek(lazySkipPointer);
+ lazySkipPointer = -1;
+ }
+
+ if (lazySkipProxCount != 0) {
+ skipPositions(lazySkipProxCount);
+ lazySkipProxCount = 0;
+ }
+}
+
+int32_t SegmentTermPositions::getPayloadLength() const { return payloadLength; }
+
+uint8_t* SegmentTermPositions::getPayload(uint8_t* data, const int32_t offset) {
+ if (!needToLoadPayload) {
+ _CLTHROWA(CL_ERR_IO, "Payload cannot be loaded more than once for the same term position.");
}
- if ( lazySkipDocCount != 0 ) {
- skipPositions( lazySkipDocCount );
- lazySkipDocCount = 0;
+
+ // read payloads lazily
+ uint8_t* retArray;
+ int32_t retOffset;
+ // TODO: Complete length logic ( possibly using ValueArray ? )
+ if (data == NULL /*|| data.length - offset < payloadLength*/) {
+ // the array is too small to store the payload data,
+ // so we allocate a new one
+ _CLDELETE_ARRAY(data);
+ retArray = _CL_NEWARRAY(uint8_t, payloadLength);
+ retOffset = 0;
+ } else {
+ retArray = data;
+ retOffset = offset;
}
-}
+ proxStream->readBytes(retArray + retOffset, payloadLength);
+ needToLoadPayload = false;
+ return retArray;
+}
+bool SegmentTermPositions::isPayloadAvailable() const { return needToLoadPayload && (payloadLength > 0); }
+
CL_NS_END
Modified: branches/lucene2_3_2/src/core/CLucene/index/SegmentTermVector.cpp
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/index/SegmentTermVector.cpp 2009-04-12 12:53:18 UTC (rev 2984)
+++ branches/lucene2_3_2/src/core/CLucene/index/SegmentTermVector.cpp 2009-04-12 13:21:07 UTC (rev 2985)
@@ -13,20 +13,18 @@
CL_NS_USE(util)
CL_NS_DEF(index)
-Array<int32_t> SegmentTermPositionVector::EMPTY_TERM_POS;
+ValueArray<int32_t> SegmentTermPositionVector::EMPTY_TERM_POS;
-SegmentTermVector::SegmentTermVector(const TCHAR* field, TCHAR** terms, Array<int32_t>* termFreqs) {
- this->field = STRDUP_TtoT(field);
- this->terms = terms;
- this->termsLen = -1; //lazily get the size of the terms
- this->termFreqs = termFreqs;
+SegmentTermVector::SegmentTermVector(const TCHAR* _field, TCHAR** _terms, ValueArray<int32_t>* _termFreqs) {
+ this->field = STRDUP_TtoT(_field); // TODO: Try and avoid this dup (using intern'ing perhaps?)
+ this->terms = _terms;
+ this->termsLen = -1; //lazily get the size of the terms array
+ this->termFreqs = _termFreqs;
}
SegmentTermVector::~SegmentTermVector(){
- _CLDELETE_CARRAY(field);
- _CLDELETE_CARRAY_ALL(terms);
-
- _CLDELETE_ARRAY(termFreqs->values);
+ _CLDELETE_LCARRAY(field);
+ _CLDELETE_LCARRAY_ALL(terms);
_CLDELETE(termFreqs);
}
TermPositionVector* SegmentTermVector::__asTermPositionVector(){
@@ -34,45 +32,45 @@
}
const TCHAR* SegmentTermVector::getField() {
-return field;
+ return field;
}
TCHAR* SegmentTermVector::toString() const{
-StringBuffer sb;
-sb.appendChar('{');
-sb.append(field);
-sb.append(_T(": "));
+ StringBuffer sb;
+ sb.appendChar('{');
+ sb.append(field);
+ sb.append(_T(": "));
-int32_t i=0;
-while ( terms && terms[i] != NULL ){
- if (i>0)
- sb.append(_T(", "));
- sb.append(terms[i]);
- sb.appendChar('/');
+ int32_t i=0;
+ while ( terms && terms[i] != NULL ){
+ if (i>0)
+ sb.append(_T(", "));
+ sb.append(terms[i]);
+ sb.appendChar('/');
- sb.appendInt((*termFreqs)[i]);
+ sb.appendInt((*termFreqs)[i]);
+ }
+ sb.appendChar('}');
+ return sb.toString();
}
-sb.appendChar('}');
-return sb.toString();
-}
int32_t SegmentTermVector::size() {
-if ( terms == NULL )
- return 0;
+ if ( terms == NULL )
+ return 0;
-if ( termsLen == -1 ){
- termsLen=0;
- while ( terms[termsLen] != 0 )
- termsLen++;
+ if ( termsLen == -1 ){
+ termsLen=0;
+ while ( terms[termsLen] != 0 )
+ termsLen++;
+ }
+ return termsLen;
}
-return termsLen;
-}
const TCHAR** SegmentTermVector::getTerms() {
return (const TCHAR**)terms;
}
-const Array<int32_t>* SegmentTermVector::getTermFrequencies() {
+const ValueArray<int32_t>* SegmentTermVector::getTermFrequencies() {
return termFreqs;
}
@@ -103,92 +101,63 @@
return res >= 0 ? res : -1;
}
-void SegmentTermVector::indexesOf(const TCHAR** termNumbers, const int32_t start, const int32_t len, Array<int32_t>& ret) {
+ValueArray<int32_t>* SegmentTermVector::indexesOf(const TCHAR** termNumbers, const int32_t start, const int32_t len) {
// TODO: there must be a more efficient way of doing this.
// At least, we could advance the lower bound of the terms array
// as we find valid indexes. Also, it might be possible to leverage
// this even more by starting in the middle of the termNumbers array
// and thus dividing the terms array maybe in half with each found index.
- ret.length = len;
- ret.values = _CL_NEWARRAY(int32_t,len);
+ ValueArray<int32_t>* ret = _CLNEW ValueArray<int32_t>(len);
for (int32_t i=0; i<len; ++i) {
- ret.values[i] = indexOf(termNumbers[start+ i]);
+ ret->values[i] = indexOf(termNumbers[start+ i]);
}
+ return ret;
}
+void SegmentTermVector::indexesOf(const TCHAR** terms, const int32_t start, const int32_t len, ValueArray<int32_t>& ret){
+ ret = *indexesOf(terms,start,len);
+}
-
-SegmentTermPositionVector::SegmentTermPositionVector(const TCHAR* field, TCHAR** terms, Array<int32_t>* termFreqs, Array< Array<int32_t> >* positions, Array< Array<TermVectorOffsetInfo> >* offsets):
- SegmentTermVector(field,terms,termFreqs)
+SegmentTermPositionVector::SegmentTermPositionVector(const TCHAR* field, TCHAR** terms, ValueArray<int32_t>* termFreqs, ObjectArray< ValueArray<int32_t> >* _positions, ObjectArray< ObjectArray<TermVectorOffsetInfo> >* _offsets)
+ : SegmentTermVector(field,terms,termFreqs),offsets(_offsets),positions(_positions)
{
- this->offsets = offsets;
- this->positions = positions;
}
-
-void SegmentTermPositionVector::indexesOf(const TCHAR** termNumbers, const int32_t start, const int32_t len, CL_NS(util)::Array<int32_t>& ret)
- { SegmentTermVector::indexesOf(termNumbers, start, len, ret); }
-
-
SegmentTermPositionVector::~SegmentTermPositionVector(){
- if ( offsets ){
- for (size_t i=0;i<offsets->length;i++){
- if ( offsets->values != NULL ){
- Array<TermVectorOffsetInfo>& offs = offsets->values[i];
- for ( size_t j=0;j<offs.length;j++ ){
- _CLDELETE_ARRAY(offs.values);
- }
- }
- }
- _CLDELETE_ARRAY(offsets->values);
- _CLDELETE(offsets);
- }
- if ( positions ){
- for (size_t i=0;i<positions->length;i++){
- if ( positions->values != NULL ){
- Array<int32_t>& pos = positions->values[i];
- for ( size_t j=0;j<pos.length;j++ ){
- _CLDELETE_ARRAY(pos.values);
- }
- }
- }
- _CLDELETE_ARRAY(positions->values);
- _CLDELETE(positions);
- }
+ _CLLDELETE(offsets);
+ _CLLDELETE(positions);
}
+ValueArray<int32_t>* SegmentTermPositionVector::indexesOf(const TCHAR** termNumbers, const int32_t start, const int32_t len)
+ { return SegmentTermVector::indexesOf(termNumbers, start, len); }
+
TermPositionVector* SegmentTermPositionVector::__asTermPositionVector(){
return this;
}
-/**
-* Returns an array of TermVectorOffsetInfo in which the term is found.
-*
-* @param index The position in the array to get the offsets from
-* @return An array of TermVectorOffsetInfo objects or the empty list
-* @see org.apache.lucene.analysis.Token
-*/
-Array<TermVectorOffsetInfo>* SegmentTermPositionVector::getOffsets(const size_t index) {
+
+ObjectArray<TermVectorOffsetInfo>* SegmentTermPositionVector::getOffsets(const size_t index) {
if(offsets == NULL)
return NULL;
if (index >=0 && index < offsets->length)
- return &offsets->values[index];
+ return offsets->values[index];
else
return &TermVectorOffsetInfo::EMPTY_OFFSET_INFO;
}
-/**
-* Returns an array of positions in which the term is found.
-* Terms are identified by the index at which its number appears in the
-* term String array obtained from the <code>indexOf</code> method.
-*/
-Array<int32_t>* SegmentTermPositionVector::getTermPositions(const size_t index) {
+ValueArray<int32_t>* SegmentTermPositionVector::getTermPositions(const size_t index) {
if(positions == NULL)
return NULL;
if (index >=0 && index < positions->length)
- return &positions->values[index];
+ return positions->values[index];
else
return &EMPTY_TERM_POS;
}
+
+void SegmentTermPositionVector::indexesOf(const TCHAR** termNumbers, const int32_t start, const int32_t len, CL_NS(util)::ValueArray<int32_t>& ret)
+{
+ ret = *indexesOf(termNumbers,start,len);
+}
+
CL_NS_END
Modified: branches/lucene2_3_2/src/core/CLucene/index/TermVector.h
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/index/TermVector.h 2009-04-12 12:53:18 UTC (rev 2984)
+++ branches/lucene2_3_2/src/core/CLucene/index/TermVector.h 2009-04-12 13:21:07 UTC (rev 2985)
@@ -17,10 +17,10 @@
struct TermVectorOffsetInfo;
class TermPositionVector;
-/** Provides access to stored term vector of
- * a document field. The vector consists of the name of the field, an array of the terms tha occur in the field of the
- * {@link org.apache.lucene.document.Document} and a parallel array of frequencies. Thus, getTermFrequencies()[5] corresponds with the
- * frequency of getTerms()[5], assuming there are at least 5 terms in the Document.
+/** Provides access to stored term vector of
+ * a document field. The vector consists of the name of the field, an array of the terms that occur in the field of the
+ * {@link org.apache.lucene.document.Document} and a parallel array of frequencies. Thus, getTermFrequencies()[5] corresponds with the
+ * frequency of getTerms()[5], assuming there are at least 5 terms in the Document.
*/
class CLUCENE_EXPORT TermFreqVector:LUCENE_BASE {
public:
@@ -28,7 +28,7 @@
}
/**
- * The {@link org.apache.lucene.document.Fieldable} name.
+ * The Field name.
* @return The name of the field this vector is associated with.
*
*/
@@ -53,7 +53,7 @@
* The size of the returned array is size()
* @memory Returning a pointer to internal data. Do not delete.
*/
- virtual const CL_NS(util)::Array<int32_t>* getTermFrequencies() = 0;
+ virtual const CL_NS(util)::ValueArray<int32_t>* getTermFrequencies() = 0;
/** Return an index in the term numbers array returned from
@@ -73,7 +73,7 @@
* @param start index in the array where the list of terms starts
* @param len the number of terms in the list
*/
- virtual void indexesOf(const TCHAR** terms, const int32_t start, const int32_t len, CL_NS(util)::Array<int32_t>& ret) = 0;
+ virtual void indexesOf(const TCHAR** terms, const int32_t start, const int32_t len, CL_NS(util)::ValueArray<int32_t>& ret) = 0;
/** Solve the diamond inheritence problem by providing a reinterpret function.
* No dynamic casting is required and no RTTI data is needed to do this
@@ -82,19 +82,45 @@
};
-
+/**
+* The TermVectorOffsetInfo class holds information pertaining to a Term in a {@link TermPositionVector}'s
+* offset information. This offset information is the character offset as set during the Analysis phase (and thus may not be the actual offset in the
+* original content).
+*/
struct CLUCENE_EXPORT TermVectorOffsetInfo {
+public:
+ /**
+ * Convenience declaration when creating a {@link org.apache.lucene.index.TermPositionVector} that stores only position information.
+ */
+private:
int startOffset;
int endOffset;
-public:
- static CL_NS(util)::Array<TermVectorOffsetInfo> EMPTY_OFFSET_INFO;
+public: // TODO: Remove after TermVectorWriter has been ported
+ static CL_NS(util)::ObjectArray<TermVectorOffsetInfo> EMPTY_OFFSET_INFO;
TermVectorOffsetInfo();
~TermVectorOffsetInfo();
TermVectorOffsetInfo(int32_t startOffset, int32_t endOffset);
+
+ /**
+ * The accessor for the ending offset for the term
+ * @return The offset
+ */
int32_t getEndOffset() const;
- void setEndOffset(int32_t endOffset);
+ void setEndOffset(const int32_t _endOffset);
+
+ /**
+ * The accessor for the starting offset of the term.
+ *
+ * @return The offset
+ */
int32_t getStartOffset() const;
- void setStartOffset(int32_t startOffset);
+ void setStartOffset(const int32_t _startOffset);
+
+ /**
+ * Two TermVectorOffsetInfos are equal if both the start and end offsets are the same
+ * @param o The comparison Object
+ * @return true if both {@link #getStartOffset()} and {@link #getEndOffset()} are the same for both objects.
+ */
bool equals(TermVectorOffsetInfo* o);
size_t hashCode() const;
};
@@ -112,7 +138,7 @@
* term String array obtained from the <code>indexOf</code> method.
* May return null if positions have not been stored.
*/
- virtual CL_NS(util)::Array<int32_t>* getTermPositions(const size_t index) = 0;
+ virtual CL_NS(util)::ValueArray<int32_t>* getTermPositions(const size_t index) = 0;
/**
* Returns an array of TermVectorOffsetInfo in which the term is found.
@@ -123,7 +149,7 @@
* @param index The position in the array to get the offsets from
* @return An array of TermVectorOffsetInfo objects or the empty list
*/
- virtual CL_NS(util)::Array<TermVectorOffsetInfo>* getOffsets(const size_t index) = 0;
+ virtual CL_NS(util)::ObjectArray<TermVectorOffsetInfo>* getOffsets(const size_t index) = 0;
virtual ~TermPositionVector(){
}
Modified: branches/lucene2_3_2/src/core/CLucene/index/TermVectorReader.cpp
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/index/TermVectorReader.cpp 2009-04-12 12:53:18 UTC (rev 2984)
+++ branches/lucene2_3_2/src/core/CLucene/index/TermVectorReader.cpp 2009-04-12 13:21:07 UTC (rev 2985)
@@ -40,14 +40,14 @@
tvf = d->openInput(fbuf, readBufferSize);
tvfFormat = checkValidFormat(tvf);
if (-1 == docStoreOffset) {
- //this->docStoreOffset = 0;
- this->_size = static_cast<int32_t>(tvx->length() >> 3);
+ this->docStoreOffset = 0;
+ this->_size = static_cast<int64_t>(tvx->length() >> 3);
} else {
this->docStoreOffset = docStoreOffset;
this->_size = size;
// Verify the file is long enough to hold all of our
// docs
- CND_CONDITION( ((int32_t) (tvx->length() / 8)) >= size + docStoreOffset , "file is not ling enought to hold all our docs");
+ CND_CONDITION( ((int64_t) (tvx->length() / 8)) >= size + docStoreOffset , "file is not long enough to hold all of our docs");
}
}
@@ -63,34 +63,6 @@
close();
}
});
-/*
- char fbuf[CL_MAX_NAME];
- strcpy(fbuf,segment);
- char* fpbuf=fbuf+strlen(fbuf);
-
- strcpy(fpbuf, TermVectorsWriter::LUCENE_TVX_EXTENSION);
- if (d->fileExists(fbuf)) {
- tvx = d->openInput(fbuf);
- checkValidFormat(tvx);
-
- strcpy(fpbuf, TermVectorsWriter::LUCENE_TVD_EXTENSION);
- tvd = d->openInput(fbuf);
- tvdFormat = checkValidFormat(tvd);
-
- strcpy(fpbuf, TermVectorsWriter::LUCENE_TVF_EXTENSION);
- tvf = d->openInput(fbuf);
- tvfFormat = checkValidFormat(tvf);
-
- _size = tvx->length() / 8;
- }else{
- tvx = NULL;
- tvd = NULL;
- tvf = NULL;
- _size = 0;
- }
-
- this->fieldInfos = fieldInfos;
-*/
}
TermVectorsReader::TermVectorsReader(const TermVectorsReader& copy)
@@ -115,16 +87,32 @@
close();
}
+int32_t TermVectorsReader::checkValidFormat(CL_NS(store)::IndexInput* in){
+ int32_t format = in->readInt();
+ if (format > TermVectorsWriter::FORMAT_VERSION)
+ {
+ CL_NS(util)::StringBuffer err;
+ err.append(_T("Incompatible format version: "));
+ err.appendInt(format);
+ err.append(_T(" expected "));
+ err.appendInt(TermVectorsWriter::FORMAT_VERSION);
+ err.append(_T(" or less"));
+ _CLTHROWT(CL_ERR_CorruptIndex,err.getBuffer());
+ }
+ return format;
+}
+
void TermVectorsReader::close(){
- // why don't we trap the exception and at least make sure that
+ // make all effort to close up. Keep the first exception
+ // and throw it as a new one.
+ // todo: why don't we trap the exception and at least make sure that
// all streams that we can close are closed?
CLuceneError keep;
bool thrown = false;
if (tvx != NULL){
- try{
- tvx->close();
- }catch(CLuceneError& err){
+ try{tvx->close();}
+ catch(CLuceneError& err){
if ( err.number() == CL_ERR_IO ){
keep = err;
thrown = true;
@@ -134,9 +122,8 @@
_CLDELETE(tvx);//delete even if error thrown
}
if (tvd != NULL){
- try{
- tvd->close();
- }catch(CLuceneError& err){
+ try{tvd->close();}
+ catch(CLuceneError& err){
if ( err.number() == CL_ERR_IO ){
keep = err;
thrown = true;
@@ -146,9 +133,8 @@
_CLDELETE(tvd);
}
if (tvf != NULL){
- try{
- tvf->close();
- }catch(CLuceneError& err){
+ try{tvf->close();}
+ catch(CLuceneError& err){
if ( err.number() == CL_ERR_IO ){
keep = err;
thrown = true;
@@ -162,16 +148,18 @@
throw keep;
}
-TermFreqVector* TermVectorsReader::get(const int32_t docNum, const TCHAR* field){
- // Check if no term vectors are available for this segment at all
- int32_t fieldNumber = fieldInfos->fieldNumber(field);
- TermFreqVector* result = NULL;
- if (tvx != NULL) {
+int64_t TermVectorsReader::size() const{
+ return _size;
+}
+
+void TermVectorsReader::get(const int32_t docNum, const TCHAR* field, TermVectorMapper* mapper){
+ if (tvx != NULL) {
+ int32_t fieldNumber = fieldInfos->fieldNumber(field);
//We need to account for the FORMAT_SIZE at when seeking in the tvx
//We don't need to do this in other seeks because we already have the
// file pointer
//that was written in another file
- tvx->seek(((docNum + docStoreOffset) * 8L) + TermVectorsWriter::FORMAT_SIZE);
+ tvx->seek(((docNum + docStoreOffset) * 8L) + FORMAT_SIZE);
int64_t position = tvx->readLong();
tvd->seek(position);
@@ -182,10 +170,11 @@
int32_t number = 0;
int32_t found = -1;
for (int32_t i = 0; i < fieldCount; ++i) {
- if(tvdFormat == TermVectorsWriter::FORMAT_VERSION)
+ if(tvdFormat == FORMAT_VERSION)
number = tvd->readVInt();
else
number += tvd->readVInt();
+
if (number == fieldNumber)
found = i;
}
@@ -195,20 +184,34 @@...
[truncated message content] |