From: <syn...@us...> - 2008-08-27 12:53:25
Revision: 2879
http://clucene.svn.sourceforge.net/clucene/?rev=2879&view=rev
Author: synhershko
Date: 2008-08-27 12:53:20 +0000 (Wed, 27 Aug 2008)
Log Message:
-----------
Field now uses a single pointer to hold the content of the various field types. Due to this change, Internal is used less and might be removed later.
More code ported into FieldsReader
IndexInput::skipChars made protected as per JL
Introducing FieldSelector and FieldSelectorResult
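For illustration only (not part of this commit), the "one pointer plus type tag" pattern this change moves Field to can be sketched as a small standalone C++ struct. The struct and accessor names below are hypothetical stand-ins, not the CLucene API, and wchar_t is used where Field uses TCHAR; only the ValueType constants are taken from the diff further down.

    // Hypothetical stand-in for Field's new value storage: one untyped pointer
    // plus an enum recording which kind of value it currently holds.
    enum ValueType { VALUE_NONE = 0, VALUE_STRING = 1, VALUE_READER = 2,
                     VALUE_STREAM = 4, VALUE_TOKENSTREAM = 8 };

    struct TaggedFieldValue {
        void*     fieldsData;  // points to a string, reader or stream, depending on valueType
        ValueType valueType;

        TaggedFieldValue() : fieldsData(0), valueType(VALUE_NONE) {}

        // A typed accessor returns NULL unless the tag matches, mirroring the
        // stringValue()/readerValue()/streamValue() pattern in the diff below.
        const wchar_t* stringValue() const {
            return (valueType == VALUE_STRING) ? static_cast<const wchar_t*>(fieldsData) : 0;
        }
    };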
Modified Paths:
--------------
branches/lucene2_3_2/src/core/CLucene/document/Field.cpp
branches/lucene2_3_2/src/core/CLucene/document/Field.h
branches/lucene2_3_2/src/core/CLucene/files_list.txt
branches/lucene2_3_2/src/core/CLucene/index/FieldsReader.cpp
branches/lucene2_3_2/src/core/CLucene/index/_FieldsReader.h
branches/lucene2_3_2/src/core/CLucene/store/IndexInput.h
branches/lucene2_3_2/src/core/CMakeLists.txt
Added Paths:
-----------
branches/lucene2_3_2/src/core/CLucene/document/FieldSelector.cpp
branches/lucene2_3_2/src/core/CLucene/document/_FieldSelector.h
Modified: branches/lucene2_3_2/src/core/CLucene/document/Field.cpp
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/document/Field.cpp 2008-08-27 08:10:53 UTC (rev 2878)
+++ branches/lucene2_3_2/src/core/CLucene/document/Field.cpp 2008-08-27 12:53:20 UTC (rev 2879)
@@ -15,32 +15,36 @@
struct Field::Internal{
const TCHAR* _name;
- TCHAR* _stringValue;
- CL_NS(util)::Reader* _readerValue;
- jstreams::StreamBase<char>* _streamValue;
+ //TCHAR* _stringValue;
+ //CL_NS(util)::Reader* _readerValue;
+ //jstreams::StreamBase<char>* _streamValue;
+ //void* fieldsData;
uint32_t config;
float_t boost;
};
Field::Field(const TCHAR* Name, Reader* reader, int config):
- internal(new Internal)
+ _internal(new Internal), lazy(false)
{
CND_PRECONDITION(Name != NULL, "Name cannot be NULL");
CND_PRECONDITION(reader != NULL, "reader cannot be NULL");
- internal->_name = CLStringIntern::intern( Name );
- internal->_stringValue = NULL;
- internal->_readerValue = reader;
- internal->_streamValue = NULL;
- internal->boost=1.0f;
+ _internal->_name = CLStringIntern::intern( Name );
+ //_internal->_stringValue = NULL;
+ //_internal->_readerValue = reader;
+ //_internal->_streamValue = NULL;
+ fieldsData = reader;
+ valueType = VALUE_READER;
+ _internal->boost=1.0f;
+
setConfig(config);
}
Field::Field(const TCHAR* Name, const TCHAR* Value, int _config):
- internal(new Internal)
+ _internal(new Internal), lazy(false)
{
CND_PRECONDITION(Name != NULL, "Name cannot be NULL");
CND_PRECONDITION(Value != NULL, "value cannot be NULL");
@@ -53,87 +57,111 @@
_CLTHROWA(CL_ERR_IllegalArgument,"cannot store term vector information for a field that is not indexed");
*/
- internal->_name = CLStringIntern::intern( Name );
- internal->_stringValue = stringDuplicate( Value );
- internal->_readerValue = NULL;
- internal->_streamValue = NULL;
- internal->boost=1.0f;
+ _internal->_name = CLStringIntern::intern( Name );
+ //_internal->_stringValue = stringDuplicate( Value );
+ //_internal->_readerValue = NULL;
+ //_internal->_streamValue = NULL;
+ fieldsData = stringDuplicate( Value );
+ valueType = VALUE_STRING;
+ _internal->boost=1.0f;
+
//config = INDEX_TOKENIZED; // default Field is tokenized and indexed
setConfig(_config);
}
Field::Field(const TCHAR* Name, jstreams::StreamBase<char>* Value, int config):
- internal(new Internal)
+ _internal(new Internal), lazy(false)
{
CND_PRECONDITION(Name != NULL, "Name cannot be NULL");
CND_PRECONDITION(Value != NULL, "value cannot be NULL");
- internal->_name = CLStringIntern::intern( Name );
- internal->_stringValue = NULL;
- internal->_readerValue = NULL;
- internal->_streamValue = Value;
- internal->boost=1.0f;
+ _internal->_name = CLStringIntern::intern( Name );
+ //_internal->_stringValue = NULL;
+ //_internal->_readerValue = NULL;
+ //_internal->_streamValue = Value;
+ fieldsData = Value;
+ valueType = VALUE_STREAM;
+ _internal->boost=1.0f;
+
setConfig(config);
}
+Field::Field(const TCHAR* Name, int config):
+ _internal(new Internal), lazy(false)
+{
+ CND_PRECONDITION(Name != NULL, "Name cannot be NULL");
+
+ _internal->_name = CLStringIntern::intern( Name );
+ fieldsData = NULL;
+ valueType = VALUE_NONE;
+
+ _internal->boost=1.0f;
+
+ setConfig(config);
+}
+
Field::~Field(){
//Func - Destructor
//Pre - true
//Post - Instance has been destroyed
- CLStringIntern::unintern(internal->_name);
+ CLStringIntern::unintern(_internal->_name);
_resetValue();
- delete internal;
+ delete _internal;
}
/*===============FIELDS=======================*/
-const TCHAR* Field::name() const { return internal->_name; } ///<returns reference
-TCHAR* Field::stringValue() const { return internal->_stringValue; } ///<returns reference
-Reader* Field::readerValue() const { return internal->_readerValue; } ///<returns reference
-jstreams::StreamBase<char>* Field::streamValue() const { return internal->_streamValue; } ///<returns reference
-CL_NS(analysis)::TokenStream* Field::tokenStreamValue() const { return NULL; }
+const TCHAR* Field::name() const { return _internal->_name; } ///<returns reference
+TCHAR* Field::stringValue() const { return (valueType & VALUE_STRING) ? static_cast<TCHAR*>(fieldsData) : NULL; } ///<returns reference
+Reader* Field::readerValue() const { return (valueType & VALUE_READER) ? static_cast<Reader*>(fieldsData) : NULL; } ///<returns reference
+jstreams::StreamBase<char>* Field::streamValue() const { return (valueType & VALUE_STREAM) ? static_cast<jstreams::StreamBase<char>*>(fieldsData) : NULL; } ///<returns reference
+CL_NS(analysis)::TokenStream* Field::tokenStreamValue() const { return (valueType & VALUE_TOKENSTREAM) ? static_cast<CL_NS(analysis)::TokenStream*>(fieldsData) : NULL; }
-bool Field::isStored() const { return (internal->config & STORE_YES) != 0; }
-bool Field::isIndexed() const { return (internal->config & INDEX_TOKENIZED)!=0 || (internal->config & INDEX_UNTOKENIZED)!=0; }
-bool Field::isTokenized() const { return (internal->config & INDEX_TOKENIZED) != 0; }
-bool Field::isCompressed() const { return (internal->config & STORE_COMPRESS) != 0; }
-bool Field::isBinary() const { return internal->_streamValue!=NULL; }
+bool Field::isStored() const { return (_internal->config & STORE_YES) != 0; }
+bool Field::isIndexed() const { return (_internal->config & INDEX_TOKENIZED)!=0 || (_internal->config & INDEX_UNTOKENIZED)!=0; }
+bool Field::isTokenized() const { return (_internal->config & INDEX_TOKENIZED) != 0; }
+bool Field::isCompressed() const { return (_internal->config & STORE_COMPRESS) != 0; }
+bool Field::isBinary() const { return (valueType & VALUE_STREAM) && fieldsData!=NULL; }
-bool Field::isTermVectorStored() const { return (internal->config & TERMVECTOR_YES) != 0; }
-bool Field::isStoreOffsetWithTermVector() const { return (internal->config & TERMVECTOR_YES) != 0 && (internal->config & TERMVECTOR_WITH_OFFSETS) != 0; }
-bool Field::isStorePositionWithTermVector() const{ return (internal->config & TERMVECTOR_YES) != 0 && (internal->config & TERMVECTOR_WITH_POSITIONS) != 0; }
+bool Field::isTermVectorStored() const { return (_internal->config & TERMVECTOR_YES) != 0; }
+bool Field::isStoreOffsetWithTermVector() const { return (_internal->config & TERMVECTOR_YES) != 0 && (_internal->config & TERMVECTOR_WITH_OFFSETS) != 0; }
+bool Field::isStorePositionWithTermVector() const{ return (_internal->config & TERMVECTOR_YES) != 0 && (_internal->config & TERMVECTOR_WITH_POSITIONS) != 0; }
-bool Field::getOmitNorms() const { return (internal->config & INDEX_NONORMS) != 0; }
-void Field::setOmitNorms(const bool omitNorms) { internal->config |= INDEX_NONORMS; }
+bool Field::getOmitNorms() const { return (_internal->config & INDEX_NONORMS) != 0; }
+void Field::setOmitNorms(const bool omitNorms) { _internal->config |= INDEX_NONORMS; }
-bool Field::isLazy() const { return (internal->config & LAZY_YES) != 0; }
+bool Field::isLazy() const { return lazy; }
void Field::setValue(const TCHAR* value) {
_resetValue();
- internal->_stringValue = stringDuplicate( value );
+ fieldsData = stringDuplicate( value );
+ valueType = VALUE_STRING;
}
void Field::setValue(CL_NS(util)::Reader* value) {
_resetValue();
- internal->_readerValue = value;
+ fieldsData = value;
+ valueType = VALUE_READER;
}
void Field::setValue(jstreams::StreamBase<char>* value) {
_resetValue();
- internal->_streamValue = value;
+ fieldsData = value;
+ valueType = VALUE_STREAM;
}
/** Expert: change the value of this field. See <a href="#setValue(java.lang.String)">setValue(String)</a>. */
void Field::setValue(CL_NS(analysis)::TokenStream* value) {
- _resetValue();
+ //_resetValue();
//fieldsData = value;
+ //valueType = VALUE_TOKENSTREAM;
}
-void Field::setBoost(const float_t boost) { this->internal->boost = boost; }
-float_t Field::getBoost() const { return internal->boost; }
+void Field::setBoost(const float_t boost) { this->_internal->boost = boost; }
+float_t Field::getBoost() const { return _internal->boost; }
void Field::setConfig(const uint32_t x){
uint32_t newConfig=0;
@@ -200,7 +228,7 @@
}else
newConfig |= TERMVECTOR_NO;
- internal->config = newConfig;
+ _internal->config = newConfig;
}
TCHAR* Field::toString() {
@@ -252,12 +280,12 @@
result.append(name());
result.appendChar(':');
- if (! isLazy()) {
- if (internal->_stringValue != NULL)
- result.append(internal->_stringValue);
- else if ( internal->_readerValue != NULL )
+ if (! isLazy() && fieldsData != NULL) {
+ if (valueType & VALUE_STRING)
+ result.append(static_cast<const TCHAR*>(fieldsData));
+ else if (valueType & VALUE_READER)
result.append( _T("Reader") );
- else if ( internal->_streamValue != NULL )
+ else if (valueType & VALUE_STREAM)
result.append( _T("Stream") );
else
result.append( _T("NULL") );
@@ -269,9 +297,14 @@
void Field::_resetValue() {
- _CLDELETE_CARRAY(internal->_stringValue);
- _CLDELETE(internal->_readerValue);
- _CLVDELETE( internal->_streamValue );
+ if (valueType & VALUE_STRING) {
+ _CLDELETE_CARRAY(fieldsData);
+ } else if (valueType & VALUE_READER) {
+ _CLDELETE(fieldsData);
+ } else if (valueType & VALUE_STREAM) {
+ _CLVDELETE( fieldsData );
+ }
+ valueType=VALUE_NONE;
}
CL_NS_END
Modified: branches/lucene2_3_2/src/core/CLucene/document/Field.h
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/document/Field.h 2008-08-27 08:10:53 UTC (rev 2878)
+++ branches/lucene2_3_2/src/core/CLucene/document/Field.h 2008-08-27 12:53:20 UTC (rev 2879)
@@ -15,7 +15,6 @@
TODO: - Solve some inconsistencies between CL and JL - mainly in the constructors area.
- Write some more tests to make sure we conform with JL - mainly in the tokenizing and omitNorms area
- Is there a bug in JL when calling setOmitNorms after a Tokenized field was created?
- - TokenStream* implementation - mend all 3 pointers to one void* ?
*/
CL_CLASS_DEF(util,Reader)
@@ -41,8 +40,9 @@
text), so that they can be loaded lazily.
*/
class CLUCENE_EXPORT Field :LUCENE_BASE{
+private:
struct Internal;
- Internal* internal;
+ Internal* _internal;
public:
enum Store{
/** Store the original field value in the index. This is useful for short texts
@@ -128,11 +128,20 @@
TERMVECTOR_WITH_POSITIONS_OFFSETS = TERMVECTOR_WITH_OFFSETS | TERMVECTOR_WITH_POSITIONS
};
- enum { LAZY_YES = 4096 };
+ bool lazy;
+ enum ValueType {
+ VALUE_NONE = 0,
+ VALUE_STRING = 1,
+ VALUE_READER = 2,
+ VALUE_STREAM = 4,
+ VALUE_TOKENSTREAM = 8
+ };
+
Field(const TCHAR* name, const TCHAR* value, int _config);
Field(const TCHAR* name, CL_NS(util)::Reader* reader, int _config);
Field(const TCHAR* name, jstreams::StreamBase<char>* stream, int _config);
+ Field(const TCHAR* name, int _config); ///<No value, for lazy loading support
~Field();
/** The name of the field (e.g., "date", "subject", "title", "body", etc.)
@@ -159,16 +168,16 @@
* readerValue(), binaryValue(), and tokenStreamValue() must be set. */
CL_NS(analysis)::TokenStream* tokenStreamValue() const;
- // True iff the value of the field is to be stored in the index for return
+ // True if the value of the field is to be stored in the index for return
// with search hits. It is an error for this to be true if a field is
// Reader-valued.
bool isStored() const;
- // True iff the value of the field is to be indexed, so that it may be
+ // True if the value of the field is to be indexed, so that it may be
// searched on.
bool isIndexed() const;
- // True iff the value of the field should be tokenized as text prior to
+ // True if the value of the field should be tokenized as text prior to
// indexing. Un-tokenized fields are indexed as a single word and may not be
// Reader-valued.
bool isTokenized() const;
@@ -181,7 +190,7 @@
*/
bool isCompressed() const;
- /** True iff the term or terms used to index this field are stored as a term
+ /** True if the term or terms used to index this field are stored as a term
* vector, available from {@link IndexReader#getTermFreqVector(int32_t,TCHAR*)}.
* These methods do not provide access to the original content of the field,
* only to terms used to index it. If the original content must be
@@ -290,6 +299,9 @@
inline void setConfig(const uint32_t termVector);
inline void _resetValue();
+
+ void* fieldsData;
+ ValueType valueType;
};
CL_NS_END
#endif
Added: branches/lucene2_3_2/src/core/CLucene/document/FieldSelector.cpp
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/document/FieldSelector.cpp (rev 0)
+++ branches/lucene2_3_2/src/core/CLucene/document/FieldSelector.cpp 2008-08-27 12:53:20 UTC (rev 2879)
@@ -0,0 +1,22 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+*
+* Distributable under the terms of either the Apache License (Version 2.0) or
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#include "CLucene/_ApiHeader.h"
+
+#include "_FieldSelector.h"
+
+CL_NS_DEF(document)
+
+FieldSelector::~FieldSelector(){
+}
+
+LoadFirstFieldSelector::~LoadFirstFieldSelector(){
+}
+
+FieldSelectorResult LoadFirstFieldSelector::accept(const TCHAR* fieldName) {
+ return FieldSelectorResult::LOAD_AND_BREAK;
+}
+CL_NS_END
Added: branches/lucene2_3_2/src/core/CLucene/document/_FieldSelector.h
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/document/_FieldSelector.h (rev 0)
+++ branches/lucene2_3_2/src/core/CLucene/document/_FieldSelector.h 2008-08-27 12:53:20 UTC (rev 2879)
@@ -0,0 +1,101 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+*
+* Distributable under the terms of either the Apache License (Version 2.0) or
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#ifndef _lucene_document_FieldSelector_
+#define _lucene_document_FieldSelector_
+
+
+CL_NS_DEF(document)
+
+/**
+ * Provides information about what should be done with this Field
+ *
+ **/
+static enum FieldSelectorResult {
+ /**
+ * Load this {@link Field} every time the {@link Document} is loaded, reading in the data as it is encounterd.
+ * {@link Document#getField(String)} and {@link Document#getFieldable(String)} should not return null.
+ *<p/>
+ * {@link Document#add(Fieldable)} should be called by the Reader.
+ */
+ LOAD = 0,
+
+ /**
+ * Lazily load this {@link Field}. This means the {@link Field} is valid, but it may not actually contain its data until
+ * invoked. {@link Document#getField(String)} SHOULD NOT BE USED. {@link Document#getFieldable(String)} is safe to use and should
+ * return a valid instance of a {@link Fieldable}.
+ *<p/>
+ * {@link Document#add(Fieldable)} should be called by the Reader.
+ */
+ LAZY_LOAD = 1,
+
+ /**
+ * Do not load the {@link Field}. {@link Document#getField(String)} and {@link Document#getFieldable(String)} should return null.
+ * {@link Document#add(Fieldable)} is not called.
+ * <p/>
+ * {@link Document#add(Fieldable)} should not be called by the Reader.
+ */
+ NO_LOAD = 2,
+
+ /**
+ * Load this field as in the {@link #LOAD} case, but immediately return from {@link Field} loading for the {@link Document}. Thus, the
+ * Document may not have its complete set of Fields. {@link Document#getField(String)} and {@link Document#getFieldable(String)} should
+ * both be valid for this {@link Field}
+ * <p/>
+ * {@link Document#add(Fieldable)} should be called by the Reader.
+ */
+ LOAD_AND_BREAK = 3,
+
+ /**
+ * Behaves much like {@link #LOAD} but does not uncompress any compressed data. This is used for internal purposes.
+ * {@link Document#getField(String)} and {@link Document#getFieldable(String)} should not return null.
+ * <p/>
+ * {@link Document#add(Fieldable)} should be called by the Reader.
+ */
+ LOAD_FOR_MERGE = 4,
+
+ /** Expert: Load the size of this {@link Field} rather than its value.
+ * Size is measured as number of bytes required to store the field == bytes for a binary or any compressed value, and 2*chars for a String value.
+ * The size is stored as a binary value, represented as an int in a byte[], with the higher order byte first in [0]
+ */
+ SIZE = 5,
+
+ /** Expert: Like {@link #SIZE} but immediately break from the field loading loop, i.e., stop loading further fields, after the size is loaded */
+ SIZE_AND_BREAK = 6
+};
+
+/**
+ * Similar to a {@link java.io.FileFilter}, the FieldSelector allows one to make decisions about
+ * what Fields get loaded on a {@link Document} by {@link org.apache.lucene.index.IndexReader#document(int,org.apache.lucene.document.FieldSelector)}
+ *
+ **/
+class FieldSelector :LUCENE_BASE {
+public:
+ ~FieldSelector();
+
+ /**
+ *
+ * @param fieldName the field to accept or reject
+ * @return an instance of {@link FieldSelectorResult}
+ * if the {@link Field} named <code>fieldName</code> should be loaded.
+ */
+ virtual FieldSelectorResult accept(const TCHAR* fieldName) = 0;
+};
+
+/**
+ * Load the First field and break.
+ * <p/>
+ * See {@link FieldSelectorResult#LOAD_AND_BREAK}
+ */
+class LoadFirstFieldSelector :FieldSelector {
+public:
+ ~LoadFirstFieldSelector();
+
+ FieldSelectorResult accept(const TCHAR* fieldName);
+};
+
+CL_NS_END
+#endif
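As a usage illustration only (not part of this commit), a caller could subclass the FieldSelector declared above to load one named field eagerly and skip everything else. The class name and the "title" field are assumptions; accept(), LOAD and NO_LOAD come from the header, and _T()/_tcscmp follow the TCHAR conventions already used in this code.

    // Hypothetical selector: eagerly load the "title" field, skip all others.
    class TitleOnlySelector : public FieldSelector {
    public:
        ~TitleOnlySelector() {}
        FieldSelectorResult accept(const TCHAR* fieldName) {
            return (_tcscmp(fieldName, _T("title")) == 0) ? LOAD : NO_LOAD;
        }
    };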
Modified: branches/lucene2_3_2/src/core/CLucene/index/FieldsReader.cpp
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/index/FieldsReader.cpp 2008-08-27 08:10:53 UTC (rev 2878)
+++ branches/lucene2_3_2/src/core/CLucene/index/FieldsReader.cpp 2008-08-27 12:53:20 UTC (rev 2879)
@@ -13,7 +13,7 @@
#include "CLucene/store/Directory.h"
#include "CLucene/store/IndexInput.h"
#include "CLucene/document/Document.h"
-#include "CLucene/document/Field.h"
+#include "CLucene/document/_FieldSelector.h"
#include "_FieldInfos.h"
#include "_FieldsWriter.h"
#include "_FieldsReader.h"
@@ -60,8 +60,7 @@
indexStream = d->openInput( buf, _readBufferSize );
_CLDELETE_CaARRAY( buf );
- /*
- if (docStoreOffset != -1) {
+ if (_docStoreOffset != -1) {
// We read only a slice out of this shared fields file
this->docStoreOffset = _docStoreOffset;
this->_size = size;
@@ -73,9 +72,9 @@
} else {
this->docStoreOffset = 0;
this->_size = (int32_t) (indexStream->length() >> 3);
- }*/
+ }
- _size = (int32_t)indexStream->length()/8; //todo: remove when uncommenting block above
+ //_size = (int32_t)indexStream->length()/8;
numTotalDocs = (int32_t) (indexStream->length() >> 3);
success = true;
@@ -133,10 +132,10 @@
return _size;
}
-bool FieldsReader::doc(int32_t n, Document* doc) {
- if ( n * 8L > indexStream->length() )
+bool FieldsReader::doc(int32_t n, Document* doc, CL_NS(document)::FieldSelector* fieldSelector) {
+ if ( (n + docStoreOffset) * 8L > indexStream->length() )
return false;
- indexStream->seek(n * 8L);
+ indexStream->seek((n + docStoreOffset) * 8L);
int64_t position = indexStream->readLong();
fieldsStream->seek(position);
@@ -144,11 +143,14 @@
for (int32_t i = 0; i < numFields; i++) {
int32_t fieldNumber = fieldsStream->readVInt();
FieldInfo* fi = fieldInfos->fieldInfo(fieldNumber);
+ if ( fi == NULL ) _CLTHROWA(CL_ERR_IO, "Field stream is invalid");
- if ( fi == NULL )
- _CLTHROWA(CL_ERR_IO, "Field stream is invalid");
+ FieldSelectorResult acceptField = (fieldSelector == NULL) ? CL_NS(document)::LOAD : fieldSelector->accept(fi->name);
uint8_t bits = fieldsStream->readByte();
+ CND_CONDITION(bits <= FieldsWriter::FIELD_IS_COMPRESSED + FieldsWriter::FIELD_IS_TOKENIZED + FieldsWriter::FIELD_IS_BINARY,
+ "invalid field bits");
+
if ((bits & FieldsWriter::FIELD_IS_BINARY) != 0) {
int32_t fieldLen = fieldsStream->readVInt();
FieldsReader::FieldsStreamHolder* subStream = new FieldsReader::FieldsStreamHolder(fieldsStream, fieldLen);
@@ -215,7 +217,7 @@
fieldsStream->seek(fieldsStream->getFilePointer() + fieldLen);
}else {
TCHAR* fvalue = fieldsStream->readString();
- Field* f = _CLNEW Field(
+ Field* f = _CLNEW Field(
fi->name, // name
fvalue, // read value
bits);
@@ -229,6 +231,124 @@
return true;
}
+CL_NS(store)::IndexInput* FieldsReader::rawDocs(int32_t* lengths, const int32_t startDocID, const int32_t numDocs) {
+ indexStream->seek((docStoreOffset+startDocID) * 8L);
+ int64_t startOffset = indexStream->readLong();
+ int64_t lastOffset = startOffset;
+ int32_t count = 0;
+ while (count < numDocs) {
+ int64_t offset;
+ const int32_t docID = docStoreOffset + startDocID + count + 1;
+ CND_CONDITION( docID <= numTotalDocs, "invalid docID");
+ if (docID < numTotalDocs)
+ offset = indexStream->readLong();
+ else
+ offset = fieldsStream->length();
+ lengths[count++] = static_cast<int32_t>(offset-lastOffset);
+ lastOffset = offset;
+ }
+
+ fieldsStream->seek(startOffset);
+
+ return fieldsStream;
+}
+
+void FieldsReader::skipField(const bool binary, const bool compressed) {
+ skipField(binary, compressed, fieldsStream->readVInt());
+}
+
+void FieldsReader::skipField(const bool binary, const bool compressed, const int32_t toRead) {
+ if (binary || compressed) {
+ int64_t pointer = fieldsStream->getFilePointer();
+ fieldsStream->seek(pointer + toRead);
+ } else {
+ //We need to skip chars. This will slow us down, but still better
+ fieldsStream->skipChars(toRead);
+ }
+}
+
+void FieldsReader::addFieldLazy(CL_NS(document)::Document* doc, FieldInfo* fi, const bool binary,
+ const bool compressed, const bool tokenize) {
+ if (binary) {
+ int32_t toRead = fieldsStream->readVInt();
+ int64_t pointer = fieldsStream->getFilePointer();
+ if (compressed) {
+ //was: doc.add(new Fieldable(fi.name, uncompress(b), Fieldable.Store.COMPRESS));
+ doc->add(*_CLNEW LazyField(this, fi->name, Field::STORE_COMPRESS, toRead, pointer));
+ } else {
+ //was: doc.add(new Fieldable(fi.name, b, Fieldable.Store.YES));
+ doc->add(*_CLNEW LazyField(this, fi->name, Field::STORE_YES, toRead, pointer));
+ }
+ //Need to move the pointer ahead by toRead positions
+ fieldsStream->seek(pointer + toRead);
+ } else {
+ //Field.Store store = Field.Store.YES;
+ //Field.Index index = getIndexType(fi, tokenize);
+ //Field.TermVector termVector = getTermVectorType(fi);
+
+ LazyField* f = NULL;
+ if (compressed) {
+ int32_t toRead = fieldsStream->readVInt();
+ int64_t pointer = fieldsStream->getFilePointer();
+ f = _CLNEW LazyField(this, fi->name, Field::STORE_COMPRESS, toRead, pointer);
+ //skip over the part that we aren't loading
+ fieldsStream->seek(pointer + toRead);
+ f->setOmitNorms(fi->omitNorms);
+ } else {
+ int32_t length = fieldsStream->readVInt();
+ int64_t pointer = fieldsStream->getFilePointer();
+ //Skip ahead of where we are by the length of what is stored
+ fieldsStream->skipChars(length);
+ f = _CLNEW LazyField(this, fi->name, Field::STORE_YES | getIndexType(fi, tokenize) | getTermVectorType(fi), length, pointer);
+ f->setOmitNorms(fi->omitNorms);
+ }
+ doc->add(*f);
+ }
+}
+
+/*
+// Add the size of field as a byte[] containing the 4 bytes of the integer byte size (high order byte first; char = 2 bytes)
+// Read just the size -- caller must skip the field content to continue reading fields
+// Return the size in bytes or chars, depending on field type
+int32_t FieldsReader::addFieldSize(const CL_NS(document)::Document* doc, const FieldInfo* fi, const bool binary, const bool compressed) {
+ const int32_t size = fieldsStream->readVInt();
+ const int32_t bytesize = binary || compressed ? size : 2*size;
+ uint8_t* sizebytes = _CL_NEWARRAY(byte, 4);
+ sizebytes[0] = (byte) (bytesize>>>24);
+ sizebytes[1] = (byte) (bytesize>>>16);
+ sizebytes[2] = (byte) (bytesize>>> 8);
+ sizebytes[3] = (byte) bytesize ;
+ doc->add(*_CLNEW Field(fi->name, sizebytes, Field::STORE_YES));
+ return size;
+}*/
+
+CL_NS(document)::Field::TermVector FieldsReader::getTermVectorType(const FieldInfo* fi) {
+ if (fi->storeTermVector) {
+ if (fi->storeOffsetWithTermVector) {
+ if (fi->storePositionWithTermVector) {
+ return Field::TERMVECTOR_WITH_POSITIONS_OFFSETS;
+ } else {
+ return Field::TERMVECTOR_WITH_OFFSETS;
+ }
+ } else if (fi->storePositionWithTermVector) {
+ return Field::TERMVECTOR_WITH_POSITIONS;
+ } else {
+ return Field::TERMVECTOR_YES;
+ }
+ } else {
+ return Field::TERMVECTOR_NO ;
+ }
+}
+
+CL_NS(document)::Field::Index FieldsReader::getIndexType(const FieldInfo* fi, const bool tokenize) {
+ if (fi->isIndexed && tokenize)
+ return Field::INDEX_TOKENIZED;
+ else if (fi->isIndexed && !tokenize)
+ return Field::INDEX_UNTOKENIZED;
+ else
+ return Field::INDEX_NO;
+}
+
FieldsReader::FieldsStreamHolder::FieldsStreamHolder(IndexInput* indexInput, int32_t subLength){
this->indexInput = indexInput->clone();
this->indexInputStream = new IndexInputStream(this->indexInput);
@@ -268,4 +388,98 @@
return ret;
}
+
+FieldsReader::LazyField::LazyField(FieldsReader* _parent, const TCHAR* _name,
+ int config, const int32_t _toRead, const int64_t _pointer)
+: Field(_name, config), parent(_parent) {
+ // todo: need to allow for auto setting Field::INDEX_NO | Field::TERMVECTOR_NO so only Store is required
+ this->toRead = _toRead;
+ this->pointer = _pointer;
+ lazy = true;
+}
+
+CL_NS(store)::IndexInput* FieldsReader::LazyField::getFieldStream() {
+ CL_NS(store)::IndexInput* localFieldsStream = parent->fieldsStreamTL.get();
+ if (localFieldsStream == NULL) {
+ localFieldsStream = parent->cloneableFieldsStream->clone();
+ parent->fieldsStreamTL.set(localFieldsStream);
+ }
+ return localFieldsStream;
+}
+
+uint8_t* FieldsReader::LazyField::binaryValue() {
+ parent->ensureOpen();
+ if (fieldsData == NULL) {
+ uint8_t* b = _CL_NEWARRAY(uint8_t, toRead);
+ CL_NS(store)::IndexInput* localFieldsStream = getFieldStream();
+
+ localFieldsStream->seek(pointer);
+ localFieldsStream->readBytes(b, toRead);
+ if (isCompressed()) {
+ //fieldsData = uncompress(b);
+ } else {
+ fieldsData = b;
+ }
+ valueType = VALUE_STREAM;
+ }
+ return static_cast<uint8_t*>(fieldsData); // instanceof byte[] ? (byte[]) fieldsData : null;
+}
+
+CL_NS(util)::Reader* FieldsReader::LazyField::readerValue() {
+ parent->ensureOpen();
+ return (valueType & VALUE_READER) ? static_cast<CL_NS(util)::Reader*>(fieldsData) : NULL;
+}
+
+
+CL_NS(analysis)::TokenStream* FieldsReader::LazyField::tokenStreamValue() {
+ parent->ensureOpen();
+ return (valueType & VALUE_TOKENSTREAM) ? static_cast<CL_NS(analysis)::TokenStream*>(fieldsData) : NULL;
+}
+
+
+/** The value of the field as a String, or null. If null, the Reader value,
+* binary value, or TokenStream value is used. Exactly one of stringValue(),
+* readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+const TCHAR* FieldsReader::LazyField::stringValue() {
+ parent->ensureOpen();
+ if (fieldsData == NULL) {
+ CL_NS(store)::IndexInput* localFieldsStream = getFieldStream();
+ localFieldsStream->seek(pointer);
+ if (isCompressed()) {
+ uint8_t* b = _CL_NEWARRAY(uint8_t, toRead);
+ localFieldsStream->readBytes(b, toRead);
+ _resetValue();
+ //fieldsData = new String(uncompress(b), "UTF-8");
+ } else {
+ //read in chars b/c we already know the length we need to read
+ TCHAR* chars = _CL_NEWARRAY(TCHAR, toRead);
+ localFieldsStream->readChars(chars, 0, toRead);
+ _resetValue();
+ fieldsData = chars;
+ }
+ valueType = VALUE_STRING;
+ }
+ return static_cast<const TCHAR*>(fieldsData); //instanceof String ? (String) fieldsData : null;
+}
+
+int64_t FieldsReader::LazyField::getPointer() const {
+ parent->ensureOpen();
+ return pointer;
+}
+
+void FieldsReader::LazyField::setPointer(const int64_t _pointer) {
+ parent->ensureOpen();
+ this->pointer = _pointer;
+}
+
+int32_t FieldsReader::LazyField::getToRead() const {
+ parent->ensureOpen();
+ return toRead;
+}
+
+void FieldsReader::LazyField::setToRead(const int32_t _toRead) {
+ parent->ensureOpen();
+ this->toRead = _toRead;
+}
+
CL_NS_END
Modified: branches/lucene2_3_2/src/core/CLucene/index/_FieldsReader.h
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/index/_FieldsReader.h 2008-08-27 08:10:53 UTC (rev 2878)
+++ branches/lucene2_3_2/src/core/CLucene/index/_FieldsReader.h 2008-08-27 12:53:20 UTC (rev 2879)
@@ -9,20 +9,22 @@
...
[truncated message content]
From: <ust...@us...> - 2009-07-08 09:54:50
Revision: 3013
http://clucene.svn.sourceforge.net/clucene/?rev=3013&view=rev
Author: ustramooner
Date: 2009-07-08 09:54:46 +0000 (Wed, 08 Jul 2009)
Log Message:
-----------
Updated FuzzyQuery to conform with JL; it should work better now. The old similarity code, which looks optimized but wasn't working properly, is commented out as a reference for future optimizations.
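For reference only (not part of this commit), the similarity the reworked FuzzyTermEnum computes boils down to a plain Levenshtein distance scaled into [0,1] via 1 - dist / (prefixLength + min(n, m)). The standalone function below sketches that formula; text and target are assumed to already have any common prefix stripped, and the early-exit and array-reuse optimizations of the real code are omitted.

    #include <algorithm>
    #include <string>
    #include <vector>

    // Hypothetical, simplified version of the fuzzy similarity measure.
    static float fuzzySimilarity(const std::wstring& text, const std::wstring& target,
                                 size_t prefixLength) {
        const size_t n = text.size(), m = target.size();
        std::vector<std::vector<int> > d(n + 1, std::vector<int>(m + 1, 0));
        for (size_t i = 0; i <= n; ++i) d[i][0] = (int)i;
        for (size_t j = 0; j <= m; ++j) d[0][j] = (int)j;
        for (size_t i = 1; i <= n; ++i) {
            for (size_t j = 1; j <= m; ++j) {
                const int cost = (text[i - 1] == target[j - 1]) ? 0 : 1;
                d[i][j] = std::min(std::min(d[i - 1][j] + 1, d[i][j - 1] + 1),
                                   d[i - 1][j - 1] + cost);
            }
        }
        // Scale the edit distance by the shorter length plus the shared prefix.
        return 1.0f - (float)d[n][m] / (float)(prefixLength + std::min(n, m));
    }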
Modified Paths:
--------------
branches/lucene2_3_2/src/core/CLucene/CLConfig.h
branches/lucene2_3_2/src/core/CLucene/search/FilteredTermEnum.cpp
branches/lucene2_3_2/src/core/CLucene/search/FilteredTermEnum.h
branches/lucene2_3_2/src/core/CLucene/search/FuzzyQuery.cpp
branches/lucene2_3_2/src/core/CLucene/search/FuzzyQuery.h
branches/lucene2_3_2/src/core/CLucene/search/_PhraseQueue.h
branches/lucene2_3_2/src/core/CLucene/util/PriorityQueue.h
branches/lucene2_3_2/src/core/files_list.txt
Modified: branches/lucene2_3_2/src/core/CLucene/CLConfig.h
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/CLConfig.h 2009-07-08 09:53:49 UTC (rev 3012)
+++ branches/lucene2_3_2/src/core/CLucene/CLConfig.h 2009-07-08 09:54:46 UTC (rev 3013)
@@ -202,5 +202,19 @@
//
////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////
+// FuzzyQuery settings
+////////////////////////////////////////////////////////////////////
+//
+// This should be somewhere around the average long word.
+// If it is longer, we waste time and space. If it is shorter, we waste a
+// little bit of time growing the array as we encounter longer words.
+//
+#define LUCENE_TYPICAL_LONGEST_WORD_IN_INDEX 19
+//
+////////////////////////////////////////////////////////////////////
+
+
#endif
Modified: branches/lucene2_3_2/src/core/CLucene/search/FilteredTermEnum.cpp
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/search/FilteredTermEnum.cpp 2009-07-08 09:53:49 UTC (rev 3012)
+++ branches/lucene2_3_2/src/core/CLucene/search/FilteredTermEnum.cpp 2009-07-08 09:54:46 UTC (rev 3013)
@@ -12,14 +12,8 @@
CL_NS_DEF(search)
- FilteredTermEnum::FilteredTermEnum(){
- //Func - Constructor
- //Pre - true
- //Post - Instance has been created
-
- currentTerm = NULL;
- actualEnum = NULL;
- }
+FilteredTermEnum::FilteredTermEnum():currentTerm(NULL),actualEnum(NULL){
+}
FilteredTermEnum::~FilteredTermEnum() {
//Func - Destructor
@@ -48,7 +42,7 @@
//The actual enumerator is not initialized!
if (actualEnum == NULL){
return false;
- }
+ }
//Finalize the currentTerm and reset it to NULL
_CLDECDELETE( currentTerm );
@@ -101,12 +95,11 @@
//Check if actualEnum is valid
if (actualEnum){
//Close the enumeration
- actualEnum->close();
- }
+ actualEnum->close();
+ //Destroy the enumeration
+ _CLDELETE(actualEnum);
+ }
- //Destroy the enumeration
- _CLDELETE(actualEnum);
-
//Destroy currentTerm
_CLDECDELETE(currentTerm);
}
@@ -118,8 +111,7 @@
CND_PRECONDITION(actualEnum != NULL,"actualEnum is NULL");
- _CLDELETE(this->actualEnum);
-
+ _CLLDELETE(this->actualEnum);
this->actualEnum = actualEnum;
// Find the first term that matches
Modified: branches/lucene2_3_2/src/core/CLucene/search/FilteredTermEnum.h
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/search/FilteredTermEnum.h 2009-07-08 09:53:49 UTC (rev 3012)
+++ branches/lucene2_3_2/src/core/CLucene/search/FilteredTermEnum.h 2009-07-08 09:54:46 UTC (rev 3013)
@@ -12,47 +12,48 @@
#include "CLucene/index/Terms.h"
CL_NS_DEF(search)
- //FilteredTermEnum is an abstract class for enumerating a subset of all terms.
- //
- //Term enumerations are always ordered by term->compareTo(). Each term in
- //the enumeration is greater than all that precede it.
-
- class CLUCENE_EXPORT FilteredTermEnum: public CL_NS(index)::TermEnum {
- public:
- //Constructor
- FilteredTermEnum();
- //Destructor
- virtual ~FilteredTermEnum();
-
- //Equality measure on the term
- virtual float_t difference() = 0;
+/** Abstract class for enumerating a subset of all terms.
- //Returns the docFreq of the current Term in the enumeration.
- int32_t docFreq() const ;
-
- //Increments the enumeration to the next element
- bool next() ;
-
- //Returns a pointer to the current Term in the enumeration.
- CL_NS(index)::Term* term();
- CL_NS(index)::Term* term(bool pointer);
-
- //Closes the enumeration to further activity, freeing resources.
- void close();
+<p>Term enumerations are always ordered by Term.compareTo(). Each term in
+the enumeration is greater than all that precede it. */
+class CLUCENE_EXPORT FilteredTermEnum: public CL_NS(index)::TermEnum {
+public:
+ FilteredTermEnum();
+ virtual ~FilteredTermEnum();
- protected:
- //Equality compare on the term */
- virtual bool termCompare(CL_NS(index)::Term* term) = 0;
-
- //Indiciates the end of the enumeration has been reached
- virtual bool endEnum() = 0;
-
- void setEnum(CL_NS(index)::TermEnum* actualEnum) ;
-
- private:
- CL_NS(index)::Term* currentTerm;
- CL_NS(index)::TermEnum* actualEnum;
-
- };
+ /** Equality measure on the term */
+ virtual float_t difference() = 0;
+
+ /**
+ * Returns the docFreq of the current Term in the enumeration.
+ * Returns -1 if no Term matches or all terms have been enumerated.
+ */
+ int32_t docFreq() const;
+
+ /** Increments the enumeration to the next element. True if one exists. */
+ bool next() ;
+
+ /** Returns the current Term in the enumeration.
+ * Returns null if no Term matches or all terms have been enumerated. */
+ CL_NS(index)::Term* term(bool pointer);
+ CL_NS(index)::Term* term();
+
+ /** Closes the enumeration to further activity, freeing resources. */
+ void close();
+
+protected:
+ /** Equality compare on the term */
+ virtual bool termCompare(CL_NS(index)::Term* term) = 0;
+
+ /** Indicates the end of the enumeration has been reached */
+ virtual bool endEnum() = 0;
+
+ void setEnum(CL_NS(index)::TermEnum* actualEnum) ;
+
+private:
+ CL_NS(index)::Term* currentTerm;
+ CL_NS(index)::TermEnum* actualEnum;
+
+};
CL_NS_END
#endif
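For illustration only (not part of this commit), a minimal subclass following the contract documented above might look like the sketch below. The class itself is hypothetical; setEnum(), termCompare(), difference() and endEnum() are the members declared in this header, and reader->terms(), _CL_POINTER and _CLDECDELETE follow the usage visible elsewhere in these diffs.

    // Hypothetical enum that accepts only terms sharing the query term's prefix.
    // Terms are ordered, so the first mismatch after the prefix range ends the scan.
    class PrefixOnlyTermEnum : public FilteredTermEnum {
        CL_NS(index)::Term* prefixTerm;
        bool done;
    public:
        PrefixOnlyTermEnum(CL_NS(index)::IndexReader* reader, CL_NS(index)::Term* prefix)
            : prefixTerm(_CL_POINTER(prefix)), done(false) {
            setEnum(reader->terms(prefix));   // position the underlying TermEnum at the prefix
        }
        virtual ~PrefixOnlyTermEnum() { _CLDECDELETE(prefixTerm); }
        float_t difference() { return 1.0f; } // every accepted term is an equally good match
    protected:
        bool endEnum() { return done; }
        bool termCompare(CL_NS(index)::Term* term) {
            if (term != NULL && term->field() == prefixTerm->field()
                && _tcsncmp(term->text(), prefixTerm->text(), prefixTerm->textLength()) == 0)
                return true;
            done = true;                      // past the prefix range; stop enumerating
            return false;
        }
    };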
Modified: branches/lucene2_3_2/src/core/CLucene/search/FuzzyQuery.cpp
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/search/FuzzyQuery.cpp 2009-07-08 09:53:49 UTC (rev 3012)
+++ branches/lucene2_3_2/src/core/CLucene/search/FuzzyQuery.cpp 2009-07-08 09:54:46 UTC (rev 3013)
@@ -8,366 +8,459 @@
#include "CLucene/index/Term.h"
#include "CLucene/index/IndexReader.h"
#include "Similarity.h"
-#include "CLucene/util/StringBuffer.h"
#include "FuzzyQuery.h"
+#include "BooleanQuery.h"
+#include "BooleanClause.h"
+#include "TermQuery.h"
+#include "CLucene/util/StringBuffer.h"
+#include "CLucene/util/PriorityQueue.h"
+
CL_NS_USE(index)
CL_NS_USE(util)
CL_NS_DEF(search)
- /** Finds and returns the smallest of three integers
- precondition: Must define int32_t __t for temporary storage and result
- */
- #define min3(a, b, c) __t = (a < b) ? a : b; __t = (__t < c) ? __t : c;
+/** Finds and returns the smallest of three integers
+ * precondition: Must define int32_t __t for temporary storage and result
+ */
+#define min3(a, b, c) __t = (a < b) ? a : b; __t = (__t < c) ? __t : c;
- /**
- * Constructor for enumeration of all terms from specified <code>reader</code> which share a prefix of
- * length <code>prefixLength</code> with <code>term</code> and which have a fuzzy similarity >
- * <code>minSimilarity</code>.
- *
- * @param reader Delivers terms.
- * @param term Pattern term.
- * @param minSimilarity Minimum required similarity for terms from the reader. Default value is 0.5f.
- * @param prefixLength Length of required common prefix. Default value is 0.
- * @throws IOException
- */
- FuzzyTermEnum::FuzzyTermEnum(IndexReader* reader, Term* term, float_t minSimilarity, size_t prefixLength):
- distance(0),
- _endEnum(false),
- prefix(STRDUP_TtoT(LUCENE_BLANK_STRING)),
- prefixLength(0),
+ FuzzyTermEnum::FuzzyTermEnum(IndexReader* reader, Term* term, float_t minSimilarity, size_t _prefixLength):
+ FilteredTermEnum(),d(NULL),dWidth(0),dHeight(0),_similarity(0),_endEnum(false),searchTerm(_CL_POINTER(term)),
+ text(NULL),textLen(0),prefix(NULL)/* ISH: was STRDUP_TtoT(LUCENE_BLANK_STRING)*/,prefixLength(_prefixLength),
minimumSimilarity(minSimilarity)
{
- //Func - Constructor
- //Pre - reader contains a valid reference to an IndexReader
- // term != NULL
- //Post - The instance has been created
+ CND_PRECONDITION(term != NULL,"term is NULL");
- CND_PRECONDITION(term != NULL,"term is NULL");
-
- scale_factor = 1.0f / (1.0f - minimumSimilarity);
- searchTerm = _CL_POINTER(term);
-
- text = STRDUP_TtoT(term->text());
- textLen = term->textLength();
-
-
+ if (minSimilarity >= 1.0f)
+ _CLTHROWA(CL_ERR_IllegalArgument,"minimumSimilarity cannot be greater than or equal to 1");
+ else if (minSimilarity < 0.0f)
+ _CLTHROWA(CL_ERR_IllegalArgument,"minimumSimilarity cannot be less than 0");
+ if(_prefixLength < 0)
+ _CLTHROWA(CL_ERR_IllegalArgument,"prefixLength cannot be less than 0");
+
+ scale_factor = 1.0f / (1.0f - minimumSimilarity); // only now we are safe from a division by zero
+ //TODO: this.field = searchTerm.field();
+
+ //The prefix could be longer than the word.
+ //It's kind of silly though. It means we must match the entire word.
+ const size_t fullSearchTermLength = searchTerm->textLength();
+ const size_t realPrefixLength = prefixLength > fullSearchTermLength ? fullSearchTermLength : prefixLength;
+
+ text = STRDUP_TtoT(searchTerm->text() + realPrefixLength);
+ textLen = fullSearchTermLength - realPrefixLength;
+
+ // TODO: what is safer to use, prefixLength or realPrefixLength?
+ prefix = _CL_NEWARRAY(TCHAR,realPrefixLength+1);
+ _tcsncpy(prefix, searchTerm->text(), realPrefixLength);
+ prefix[realPrefixLength]='\0';
+
+ initializeMaxDistances();
+ dWidth = LUCENE_TYPICAL_LONGEST_WORD_IN_INDEX; // default length of the d array
+ dHeight = textLen + 1;
+
+ Term* trm = _CLNEW Term(searchTerm->field(), prefix, true); // _CLNEW Term(term, prefix); -- not intern'd?
+ setEnum(reader->terms(trm));
+ _CLDECDELETE(trm);
+
+
+ /* LEGACY:
//Initialize e to NULL
e = NULL;
eWidth = 0;
eHeight = 0;
-
+
if(prefixLength > 0 && prefixLength < textLen){
- this->prefixLength = prefixLength;
-
- prefix = _CL_NEWARRAY(TCHAR,prefixLength+1);
- _tcsncpy(prefix,text,prefixLength);
- prefix[prefixLength]='\0';
-
- textLen = prefixLength;
- text[textLen]='\0';
- }
-
-
- //Set the enumeration
- Term* trm = _CLNEW Term(term, prefix);
- setEnum(reader->terms(trm));
- _CLDECDELETE(trm);
- }
+ this->prefixLength = prefixLength;
- FuzzyTermEnum::~FuzzyTermEnum(){
- //Func - Destructor
- //Pre - true
- //Post - FuzzyTermEnum has been destroyed
+ prefix = _CL_NEWARRAY(TCHAR,prefixLength+1);
+ _tcsncpy(prefix,text,prefixLength);
+ prefix[prefixLength]='\0';
- //Close the enumeration
- close();
- }
+ textLen = prefixLength;
+ text[textLen]='\0';
+ }
+ */
+ }
+
+ FuzzyTermEnum::~FuzzyTermEnum(){
+ close();
+ }
const char* FuzzyTermEnum::getObjectName() const{ return getClassName(); }
const char* FuzzyTermEnum::getClassName(){ return "FuzzyTermEnum"; }
- bool FuzzyTermEnum::endEnum() {
- //Func - Returns the fact if the current term in the enumeration has reached the end
- //Pre - true
- //Post - The boolean value of endEnum has been returned
+ bool FuzzyTermEnum::endEnum() {
+ return _endEnum;
+ }
- return _endEnum;
- }
+ void FuzzyTermEnum::close(){
- void FuzzyTermEnum::close(){
- //Func - Close the enumeration
- //Pre - true
- //Post - The enumeration has been closed
+ FilteredTermEnum::close();
- FilteredTermEnum::close();
-
- //Finalize the searchTerm
- _CLDECDELETE(searchTerm);
- //Destroy e
- _CLDELETE_ARRAY(e);
+ //Finalize the searchTerm
+ _CLDECDELETE(searchTerm);
- _CLDELETE_CARRAY(text);
+ free(d);
+ d=NULL;
- _CLDELETE_CARRAY(prefix);
- }
+ _CLDELETE_CARRAY(text);
- bool FuzzyTermEnum::termCompare(Term* term) {
- //Func - Compares term with the searchTerm using the Levenshtein distance.
- //Pre - term is NULL or term points to a Term
- //Post - if pre(term) is NULL then false is returned otherwise
- // if the distance of the current term in the enumeration is bigger than the FUZZY_THRESHOLD
- // then true is returned
-
- if (term == NULL){
- return false; //Note that endEnum is not set to true!
- }
+ _CLDELETE_CARRAY(prefix);
+ }
- const TCHAR* termText = term->text();
- size_t termTextLen = term->textLength();
+ bool FuzzyTermEnum::termCompare(Term* term) {
+ //Func - Compares term with the searchTerm using the Levenshtein distance.
+ //Pre - term is NULL or term points to a Term
+ //Post - if pre(term) is NULL then false is returned otherwise
+ // if the distance of the current term in the enumeration is bigger than the FUZZY_THRESHOLD
+ // then true is returned
- //Check if the field name of searchTerm of term match
- //(we can use == because fields are interned)
- if ( searchTerm->field() == term->field() &&
- (prefixLength==0 || _tcsncmp(termText,prefix,prefixLength)==0 )) {
+ if (term == NULL){
+ return false; //Note that endEnum is not set to true!
+ }
- const TCHAR* target = termText+prefixLength;
- size_t targetLen = termTextLen-prefixLength;
+ const TCHAR* termText = term->text();
+ const size_t termTextLen = term->textLength();
- //Calculate the Levenshtein distance
- int32_t dist = editDistance(text, target, textLen, targetLen);
- distance = 1 - ((float_t)dist / (float_t)cl_min(textLen, targetLen));
- return (distance > minimumSimilarity);
- }
+ //Check if the field name of searchTerm of term match
+ //(we can use == because fields are interned)
+ if ( searchTerm->field() == term->field() &&
+ (prefixLength==0 || _tcsncmp(termText,prefix,prefixLength)==0 )) {
+
+ const TCHAR* target = termText+prefixLength;
+ const size_t targetLen = termTextLen-prefixLength;
+ _similarity = similarity(target, targetLen);
+ return (_similarity > minimumSimilarity);
+
+ /* LEGACY:
+ //Calculate the Levenshtein distance
+ int32_t dist = editDistance(text, target, textLen, targetLen);
+ distance = 1 - ((float_t)dist / (float_t)cl_min(textLen, targetLen));
+ return (distance > minimumSimilarity);
+ */
+ }
_endEnum = true;
return false;
- }
+ }
- float_t FuzzyTermEnum::difference() {
- //Func - Returns the difference between the distance and the fuzzy threshold
- // multiplied by the scale factor
- //Pre - true
- //Post - The difference is returned
+ float_t FuzzyTermEnum::difference() {
+ return (float_t)((_similarity - minimumSimilarity) * scale_factor );
+ }
- return (float_t)((distance - minimumSimilarity) * scale_factor );
- }
+ // TODO: had synchronized in definition
+ float_t FuzzyTermEnum::similarity(const TCHAR* target, const size_t m) {
+ const size_t n = textLen; // TODO: remove after replacing n with textLen
+ if (n == 0) {
+ //we don't have anything to compare. That means if we just add
+ //the letters for m we get the new word
+ return prefixLength == 0 ? 0.0f : 1.0f - ((float_t) m / prefixLength);
+ }
+ if (m == 0) {
+ return prefixLength == 0 ? 0.0f : 1.0f - ((float_t) n / prefixLength);
+ }
+
+ const int32_t maxDistance = getMaxDistance(m);
+
+ if (maxDistance < abs((int32_t)(m-n))) {
+ //just adding the characters of m to n or vice-versa results in
+ //too many edits
+ //for example "pre" length is 3 and "prefixes" length is 8. We can see that
+ //given this optimal circumstance, the edit distance cannot be less than 5.
+ //which is 8-3 or more precisesly Math.abs(3-8).
+ //if our maximum edit distance is 4, then we can discard this word
+ //without looking at it.
+ return 0.0f;
+ }
+
+ //let's make sure we have enough room in our array to do the distance calculations.
+ //Check if the array must be reallocated because it is too small or does not exist
+
+ // TODO: realloc should be able to allocate memory for NULL pointers; if thats the case the NULL
+ // check here is redundant
+ if (d == NULL){
+ dWidth = cl_max(dWidth, n+1);
+ dHeight = cl_max(dHeight, m+1);
+ d = reinterpret_cast<int32_t*>(malloc(sizeof(int32_t)*dWidth*dHeight));
+ } else if (dWidth <= n || dHeight <= m) {
+ //growDistanceArray
+ dWidth = cl_max(dWidth, n+1);
+ dHeight = cl_max(dHeight, m+1);
+ d = reinterpret_cast<int32_t*>(realloc(d, sizeof(int32_t)*dWidth*dHeight));
+ }
+
+ size_t i; // iterates through the source string
+ size_t j; // iterates through the target string
+
+ // init matrix d
+ for (i = 0; i <= n; i++){
+ d[i + (0*dWidth)] = i;
+ }
+ for (j = 0; j <= m; j++){
+ d[0 + (j*dWidth)] = j;
+ }
+
+ int32_t __t; //temporary variable for min3
+
+ // start computing edit distance
+ TCHAR s_i; // ith character of s
+ for (i = 1; i <= n; i++) {
+ int32_t bestPossibleEditDistance = m;
+ s_i = text[i - 1];
+ for (j = 1; j <= m; j++) {
+ if (s_i != target[j-1]) {
+ min3(d[i-1 + (j*dWidth)], d[i + ((j-1)*dWidth)], d[i-1 + ((j-1)*dWidth)]);
+ d[i + (j*dWidth)] = __t+1;
+ }
+ else {
+ min3(d[i-1 + (j*dWidth)]+1, d[i + ((j-1)*dWidth)]+1, d[i-1 + ((j-1)*dWidth)]);
+ d[i + (j*dWidth)] = __t;
+ }
+ bestPossibleEditDistance = cl_min(bestPossibleEditDistance, d[i + (j*dWidth)]);
+ }
+
+ //After calculating row i, the best possible edit distance
+ //can be found by finding the smallest value in a given column.
+ //If the bestPossibleEditDistance is greater than the max distance, abort.
+
+ if (i > maxDistance && bestPossibleEditDistance > maxDistance) { //equal is okay, but not greater
+ //the closest the target can be to the text is just too far away.
+ //this target is leaving the party early.
+ return 0.0f;
+ }
+ }
+
+ // this will return less than 0.0 when the edit distance is
+ // greater than the number of characters in the shorter word.
+ // but this was the formula that was previously used in FuzzyTermEnum,
+ // so it has not been changed (even though minimumSimilarity must be
+ // greater than 0.0)
+ return 1.0f - ((float_t)d[n + m*dWidth] / (float_t) (prefixLength + cl_min(n, m)));
+ }
+
+ int32_t FuzzyTermEnum::getMaxDistance(const size_t m) {
+ return (m < LUCENE_TYPICAL_LONGEST_WORD_IN_INDEX) ? maxDistances[m] : calculateMaxDistance(m);
+ }
+
+ void FuzzyTermEnum::initializeMaxDistances() {
+ for (int32_t i = 0; i < LUCENE_TYPICAL_LONGEST_WORD_IN_INDEX; i++) {
+ maxDistances[i] = calculateMaxDistance(i);
+ }
+ }
+
+ int32_t FuzzyTermEnum::calculateMaxDistance(const size_t m) const {
+ return (int32_t) ((1-minimumSimilarity) * (cl_min(textLen, m) + prefixLength));
+ }
+ /* LEGACY:
+ int32_t FuzzyTermEnum::editDistance(const TCHAR* s, const TCHAR* t, const int32_t n, const int32_t m) {
+ //Func - Calculates the Levenshtein distance also known as edit distance is a measure of similiarity
+ // between two strings where the distance is measured as the number of character
+ // deletions, insertions or substitutions required to transform one string to
+ // the other string.
+ //Pre - s != NULL and contains the source string
+ // t != NULL and contains the target string
+ // n >= 0 and contains the length of the source string
+ // m >= 0 and containts the length of the target string
+ //Post - The distance has been returned
- int32_t FuzzyTermEnum::editDistance(const TCHAR* s, const TCHAR* t, const int32_t n, const int32_t m) {
- //Func - Calculates the Levenshtein distance also known as edit distance is a measure of similiarity
- // between two strings where the distance is measured as the number of character
- // deletions, insertions or substitutions required to transform one string to
- // the other string.
- //Pre - s != NULL and contains the source string
- // t != NULL and contains the target string
- // n >= 0 and contains the length of the source string
- // m >= 0 and containts the length of th target string
- //Post - The distance has been returned
+ CND_PRECONDITION(s != NULL, "s is NULL");
+ CND_PRECONDITION(t != NULL, "t is NULL");
+ CND_PRECONDITION(n >= 0," n is a negative number");
+ CND_PRECONDITION(n >= 0," n is a negative number");
- CND_PRECONDITION(s != NULL, "s is NULL");
- CND_PRECONDITION(t != NULL, "t is NULL");
- CND_PRECONDITION(n >= 0," n is a negative number");
- CND_PRECONDITION(n >= 0," n is a negative number");
+ int32_t i; // iterates through s
+ int32_t j; // iterates through t
+ TCHAR s_i; // ith character of s
- int32_t i; // iterates through s
- int32_t j; // iterates through t
- TCHAR s_i; // ith character of s
+ if (n == 0)
+ return m;
+ if (m == 0)
+ return n;
- if (n == 0)
- return m;
- if (m == 0)
- return n;
+ //Check if the array must be reallocated because it is too small or does not exist
+ if (e == NULL || eWidth <= n || eHeight <= m) {
+ //Delete e if possible
+ _CLDELETE_ARRAY(e);
+ //resize e
+ eWidth = cl_max(eWidth, n+1);
+ eHeight = cl_max(eHeight, m+1);
+ e = _CL_NEWARRAY(int32_t,eWidth*eHeight);
+ }
- //Check if the array must be reallocated because it is too small or does not exist
- if (e == NULL || eWidth <= n || eHeight <= m) {
- //Delete e if possible
- _CLDELETE_ARRAY(e);
- //resize e
- eWidth = cl_max(eWidth, n+1);
- eHeight = cl_max(eHeight, m+1);
- e = _CL_NEWARRAY(int32_t,eWidth*eHeight);
- }
-
- CND_CONDITION(e != NULL,"e is NULL");
+ CND_CONDITION(e != NULL,"e is NULL");
- // init matrix e
- for (i = 0; i <= n; i++){
- e[i + (0*eWidth)] = i;
- }
- for (j = 0; j <= m; j++){
- e[0 + (j*eWidth)] = j;
- }
+ // init matrix e
+ for (i = 0; i <= n; i++){
+ e[i + (0*eWidth)] = i;
+ }
+ for (j = 0; j <= m; j++){
+ e[0 + (j*eWidth)] = j;
+ }
- int32_t __t; //temporary variable for min3
+ int32_t __t; //temporary variable for min3
- // start computing edit distance
- for (i = 1; i <= n; i++) {
- s_i = s[i - 1];
- for (j = 1; j <= m; j++) {
- if (s_i != t[j-1]){
- min3(e[i + (j*eWidth) - 1], e[i + ((j-1)*eWidth)], e[i + ((j-1)*eWidth)-1]);
- e[i + (j*eWidth)] = __t+1;
- }else{
- min3(e[i + (j*eWidth) -1]+1, e[i + ((j-1)*eWidth)]+1, e[i + ((j-1)*eWidth)-1]);
- e[i + (j*eWidth)] = __t;
+ // start computing edit distance
+ for (i = 1; i <= n; i++) {
+ s_i = s[i - 1];
+ for (j = 1; j <= m; j++) {
+ if (s_i != t[j-1]){
+ min3(e[i + (j*eWidth) - 1], e[i + ((j-1)*eWidth)], e[i + ((j-1)*eWidth)-1]);
+ e[i + (j*eWidth)] = __t+1;
+ }else{
+ min3(e[i + (j*eWidth) -1]+1, e[i + ((j-1)*eWidth)]+1, e[i + ((j-1)*eWidth)-1]);
+ e[i + (j*eWidth)] = __t;
+ }
}
- }
- }
+ }
- // we got the result!
- return e[n + ((m)*eWidth)];
- }
+ // we got the result!
+ return e[n + ((m)*eWidth)];
+ }*/
+ class FuzzyQuery::ScoreTerm {
+ public:
+ Term* term;
+ float_t score;
- /**
- * Create a new FuzzyQuery that will match terms with a similarity
- * of at least <code>minimumSimilarity</code> to <code>term</code>.
- * If a <code>prefixLength</code> > 0 is specified, a common prefix
- * of that length is also required.
- *
- * @param term the term to search for
- * @param minimumSimilarity a value between 0 and 1 to set the required similarity
- * between the query term and the matching terms. For example, for a
- * <code>minimumSimilarity</code> of <code>0.5</code> a term of the same length
- * as the query term is considered similar to the query term if the edit distance
- * between both terms is less than <code>length(term)*0.5</code>
- * @param prefixLength length of common (non-fuzzy) prefix
- * @throws IllegalArgumentException if minimumSimilarity is > 1 or < 0
- * or if prefixLength < 0 or > <code>term.text().length()</code>.
- */
- FuzzyQuery::FuzzyQuery(Term* term, float_t minimumSimilarity, size_t prefixLength):
- MultiTermQuery(term)
+ ScoreTerm(Term* _term, float_t _score):term(_term),score(_score){
+ }
+ virtual ~ScoreTerm(){
+ }
+ };
+
+ class FuzzyQuery::ScoreTermQueue : public PriorityQueue<ScoreTerm*, CL_NS(util)::Deletor::Object<ScoreTerm> > {
+ public:
+ ScoreTermQueue(int32_t size){
+ initialize(size, true);
+ }
+ virtual ~ScoreTermQueue(){
+ }
+
+ protected:
+ bool lessThan(ScoreTerm* termA, ScoreTerm* termB) {
+ if (termA->score == termB->score)
+ return termA->term->compareTo(termB->term) > 0;
+ else
+ return termA->score < termB->score;
+ }
+ };
+
+
+ FuzzyQuery::FuzzyQuery(Term* term, float_t _minimumSimilarity, size_t _prefixLength):
+ MultiTermQuery(term),minimumSimilarity(_minimumSimilarity),prefixLength(_prefixLength)
{
- //Func - Constructor
- //Pre - term != NULL
- //Post - The instance has been created
- if ( minimumSimilarity < 0 )
- minimumSimilarity = defaultMinSimilarity;
+ if ( minimumSimilarity < 0 )
+ minimumSimilarity = defaultMinSimilarity;
- CND_PRECONDITION(term != NULL,"term is NULL");
+ CND_PRECONDITION(term != NULL,"term is NULL");
- if (minimumSimilarity > 1.0f)
- _CLTHROWA(CL_ERR_IllegalArgument,"minimumSimilarity > 1");
- else if (minimumSimilarity < 0.0f)
+ if (minimumSimilarity >= 1.0f)
+ _CLTHROWA(CL_ERR_IllegalArgument,"minimumSimilarity >= 1");
+ else if (minimumSimilarity < 0.0f)
_CLTHROWA(CL_ERR_IllegalArgument,"minimumSimilarity < 0");
-
- this->minimumSimilarity = minimumSimilarity;
-
- if(prefixLength >= term->textLength())
- _CLTHROWA(CL_ERR_IllegalArgument,"prefixLength >= term.textLength()");
- this->prefixLength = prefixLength;
+ if (prefixLength < 0)
+ _CLTHROWA(CL_ERR_IllegalArgument,"prefixLength < 0");
- }
+ /*
+ TODO: Not in original Java version
+ if(prefixLength >= term->textLength())
+ _CLTHROWA(CL_ERR_IllegalArgument,"prefixLength >= term.textLength()");
+ */
+ }
-
- float_t FuzzyQuery::defaultMinSimilarity = 0.5f;
- int32_t FuzzyQuery::defaultPrefixLength = 0;
+ float_t FuzzyQuery::defaultMinSimilarity = 0.5f;
+ int32_t FuzzyQuery::defaultPrefixLength = 0;
- FuzzyQuery::~FuzzyQuery(){
- //Func - Destructor
- //Pre - true
- //Post - Instance has been destroyed
- }
+ FuzzyQuery::~FuzzyQuery(){
+ }
- TCHAR* FuzzyQuery::toString(const TCHAR* field) const{
- StringBuffer buffer(100, false); // TODO: Have a better estimation for the initial buffer length
- Term* term = getTerm(false); // no need to increase ref count
- if ( field==NULL || _tcscmp(term->field(),field)!=0 ) {
- buffer.append(term->field());
- buffer.append( _T(":"));
- }
- buffer.append(term->text());
- buffer.append( _T("~") );
- buffer.appendFloat(minimumSimilarity,1);
- // todo: use ToStringUtils.boost()
- if (getBoost() != 1.0f) {
- buffer.appendChar ( '^' );
- buffer.appendFloat( getBoost(),1);
- }
- return buffer.getBuffer();
- }
+ float_t FuzzyQuery::getMinSimilarity() const {
+ return minimumSimilarity;
+ }
+ size_t FuzzyQuery::getPrefixLength() const {
+ return prefixLength;
+ }
+
+ TCHAR* FuzzyQuery::toString(const TCHAR* field) const{
+ StringBuffer buffer(100, false); // TODO: Have a better estimation for the initial buffer length
+ Term* term = getTerm(false); // no need to increase ref count
+ if ( field==NULL || _tcscmp(term->field(),field)!=0 ) {
+ buffer.append(term->field());
+ buffer.appendChar( _T(':'));
+ }
+ buffer.append(term->text());
+ buffer.appendChar( _T('~') );
+ buffer.appendFloat(minimumSimilarity,1);
+ buffer.appendBoost(getBoost());
+ return buffer.getBuffer();
+ }
+
const char* FuzzyQuery::getObjectName() const{
- //Func - Returns the name of the query
- //Pre - true
- //post - The string FuzzyQuery has been returned
+ //Func - Returns the name of the query
+ //Pre - true
+ //post - The string FuzzyQuery has been returned
- return getClassName();
+ return getClassName();
}
const char* FuzzyQuery::getClassName(){
- //Func - Returns the name of the query
- //Pre - true
- //post - The string FuzzyQuery has been returned
+ //Func - Returns the name of the query
+ //Pre - true
+ //post - The string FuzzyQuery has been returned
- return "FuzzyQuery";
+ return "FuzzyQuery";
}
-
- /**
- * Returns the minimum similarity that is required for this query to match.
- * @return float value between 0.0 and 1.0
- */
- float_t FuzzyQuery::getMinSimilarity() const {
- return minimumSimilarity;
- }
-
FuzzyQuery::FuzzyQuery(const FuzzyQuery& clone):
- MultiTermQuery(clone)
- {
+ MultiTermQuery(clone)
+ {
this->minimumSimilarity = clone.getMinSimilarity();
this->prefixLength = clone.getPrefixLength();
-
- //if(prefixLength < 0)
- // _CLTHROWA(CL_ERR_IllegalArgument,"prefixLength < 0");
- //else
- if(prefixLength >= clone.getTerm()->textLength())
- _CLTHROWA(CL_ERR_IllegalArgument,"prefixLength >= term.textLength()");
- }
+ //if(prefixLength < 0)
+ // _CLTHROWA(CL_ERR_IllegalArgument,"prefixLength < 0");
+ //else
+ if(prefixLength >= clone.getTerm()->textLength())
+ _CLTHROWA(CL_ERR_IllegalArgument,"prefixLength >= term.textLength()");
+ }
+
Query* FuzzyQuery::clone() const{
- return _CLNEW FuzzyQuery(*thi...
[truncated message content]