[Clucene-cvs] SF.net SVN: clucene:[3007] branches/lucene2_3_2/src/core/CLucene/util

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Revision: 3007
          http://clucene.svn.sourceforge.net/clucene/?rev=3007&view=rev
Author:   ustramooner
Date:     2009-06-11 16:18:14 +0000 (Thu, 11 Jun 2009)

Log Message:
-----------
dgap bitvectors

Modified Paths:
--------------
    branches/lucene2_3_2/src/core/CLucene/util/BitSet.cpp
    branches/lucene2_3_2/src/core/CLucene/util/BitSet.h

Modified: branches/lucene2_3_2/src/core/CLucene/util/BitSet.cpp
===================================================================

--- branches/lucene2_3_2/src/core/CLucene/util/BitSet.cpp	2009-05-04 14:46:40 UTC (rev 3006)
+++ branches/lucene2_3_2/src/core/CLucene/util/BitSet.cpp	2009-06-11 16:18:14 UTC (rev 3007)
@@ -13,6 +13,25 @@
 CL_NS_USE(store)
 CL_NS_DEF(util)
 
+
+const uint8_t BitSet::BYTE_COUNTS[256] = {
+    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
+
 BitSet::BitSet( const BitSet& copy ) :
 	_size( copy._size ),
 	_count(-1)
@@ -36,11 +55,12 @@
 	_count=-1;
 	CL_NS(store)::IndexInput* input = d->openInput( name );
 	try {
-		_size = input->readInt();			  // read size
-	  	_count = input->readInt();			  // read count
-
-		bits = _CL_NEWARRAY(uint8_t,(_size >> 3) + 1);		  // allocate bits
-		input->readBytes(bits, (_size >> 3) + 1);	  // read bits
+		 _size = input->readInt();			  // read size
+     if (_size == -1) {
+      readDgaps(input);
+     } else {
+      readBits(input);
+     }
 	} _CLFINALLY (
 	    input->close();
 	    _CLDELETE(input );
@@ -50,9 +70,11 @@
 void BitSet::write(CL_NS(store)::Directory* d, const char* name) {
 	CL_NS(store)::IndexOutput* output = d->createOutput(name);
 	try {
-	  output->writeInt(size());			  // write size
-	  output->writeInt(count());			  // write count
-	  output->writeBytes(bits, (_size >> 3) + 1);	  // write bits
+    if (isSparse()) {
+      writeDgaps(output); // sparse bit-set more efficiently saved as d-gaps.
+    } else {
+      writeBits(output);
+    }
 	} _CLFINALLY (
 	    output->close();
 	    _CLDELETE(output);
@@ -62,7 +84,12 @@
 	_CLDELETE_ARRAY(bits);
 }
 
+
 void BitSet::set(const int32_t bit, bool val){
+    if (bit >= _size) {
+	      _CLTHROWA(CL_ERR_IndexOutOfBounds, "bit out of range");
+    }
+
 	_count = -1;
 
 	if (val)
@@ -78,23 +105,6 @@
 int32_t BitSet::count(){
 	// if the BitSet has been modified
     if (_count == -1) {
-      static const uint8_t BYTE_COUNTS[] = {  
-            0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
-            1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-            1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-            2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-            1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-            2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-            2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-            3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-            1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-            2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-            2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-            3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-            2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-            3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-            3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-            4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
 
       int32_t c = 0;
       int32_t end = (_size >> 3) + 1;
@@ -108,4 +118,66 @@
 	return _CLNEW BitSet( *this );
 }
 
+  /** Read as a bit set */
+  void BitSet::readBits(IndexInput* input) {
+    _count = input->readInt();        // read count
+    bits = _CL_NEWARRAY(uint8_t,(_size >> 3) + 1);      // allocate bits
+    input->readBytes(bits, (_size >> 3) + 1);   // read bits
+  }
+
+  /** read as a d-gaps list */
+  void BitSet::readDgaps(IndexInput* input) {
+    _size = input->readInt();       // (re)read size
+    _count = input->readInt();        // read count
+    bits = _CL_NEWARRAY(uint8_t,(_size >> 3) + 1);     // allocate bits
+    int32_t last=0;
+    int32_t n = count();
+    while (n>0) {
+      last += input->readVInt();
+      bits[last] = input->readByte();
+      n -= BYTE_COUNTS[bits[last] & 0xFF];
+    }
+  }
+
+  /** Write as a bit set */
+   void BitSet::writeBits(IndexOutput* output) {
+    output->writeInt(size());       // write size
+    output->writeInt(count());        // write count
+    output->writeBytes(bits, (_size >> 3) + 1);   // write bits
+  }
+
+  /** Write as a d-gaps list */
+  void BitSet::writeDgaps(IndexOutput* output) {
+    output->writeInt(-1);            // mark using d-gaps
+    output->writeInt(size());        // write size
+    output->writeInt(count());       // write count
+    int32_t last=0;
+    int32_t n = count();
+    int32_t m = (_size >> 3);
+    for (int32_t i=0; i<m && n>0; i++) {
+      if (bits[i]!=0) {
+        output->writeVInt(i-last);
+        output->writeByte(bits[i]);
+        last = i;
+        n -= BYTE_COUNTS[bits[i] & 0xFF];
+      }
+    }
+  }
+
+  /** Indicates if the bit vector is sparse and should be saved as a d-gaps list, or dense, and should be saved as a bit set. */
+  bool BitSet::isSparse() {
+    // note: order of comparisons below set to favor smaller values (no binary range search.)
+    // note: adding 4 because we start with ((int) -1) to indicate d-gaps format.
+    // note: we write the d-gap for the byte number, and the byte (bits[i]) itself, therefore
+    //       multiplying count by (8+8) or (8+16) or (8+24) etc.:
+    //       - first 8 for writing bits[i] (1 byte vs. 1 bit), and
+    //       - second part for writing the byte-number d-gap as vint.
+    // note: factor is for read/write of byte-arrays being faster than vints.
+    int32_t factor = 10;
+    if ((_size >> 3) < (1<< 7)) return factor * (4 + (8+ 8)*count()) < size();
+    if ((_size >> 3) < (1<<14)) return factor * (4 + (8+16)*count()) < size();
+    if ((_size >> 3) < (1<<21)) return factor * (4 + (8+24)*count()) < size();
+    if ((_size >> 3) < (1<<28)) return factor * (4 + (8+32)*count()) < size();
+    return                            factor * (4 + (8+40)*count()) < size();
+  }
 CL_NS_END

Modified: branches/lucene2_3_2/src/core/CLucene/util/BitSet.h
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/util/BitSet.h	2009-05-04 14:46:40 UTC (rev 3006)
+++ branches/lucene2_3_2/src/core/CLucene/util/BitSet.h	2009-06-11 16:18:14 UTC (rev 3007)
@@ -9,13 +9,36 @@
 
 
 CL_CLASS_DEF(store,Directory)
+CL_CLASS_DEF(store,IndexInput)
+CL_CLASS_DEF(store,IndexOutput)
 
 CL_NS_DEF(util)
+
+
+/** Optimized implementation of a vector of bits.  This is more-or-less like
+  java.util.BitSet, but also includes the following:
+  <ul>
+  <li>a count() method, which efficiently computes the number of one bits;</li>
+  <li>optimized read from and write to disk;</li>
+  <li>inlinable get() method;</li>
+  <li>store and load, as bit set or d-gaps, depending on sparseness;</li> 
+  </ul>
+  */
 class CLUCENE_EXPORT BitSet:LUCENE_BASE {
 	int32_t _size;
 	int32_t _count;
 	uint8_t *bits;
-	
+
+  void readBits(CL_NS(store)::IndexInput* input);
+  /** read as a d-gaps list */
+  void readDgaps(CL_NS(store)::IndexInput* input);
+  /** Write as a bit set */
+  void writeBits(CL_NS(store)::IndexOutput* output);
+  /** Write as a d-gaps list */
+  void writeDgaps(CL_NS(store)::IndexOutput* output);
+  /** Indicates if the bit vector is sparse and should be saved as a d-gaps list, or dense, and should be saved as a bit set. */
+  bool isSparse();
+  static const uint8_t BYTE_COUNTS[256];
 protected:
 	BitSet( const BitSet& copy );
 
@@ -29,7 +52,11 @@
 	~BitSet();
 	
 	///get the value of the specified bit
+	///get the value of the specified bit
 	inline bool get(const int32_t bit) const{
+		if (bit >= _size) {
+		_CLTHROWA(CL_ERR_IndexOutOfBounds, "bit out of range");
+		}
 		return (bits[bit >> 3] & (1 << (bit & 7))) != 0;
 	}
 	


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.