|
From: <ust...@us...> - 2009-06-11 16:18:19
|
Revision: 3007
http://clucene.svn.sourceforge.net/clucene/?rev=3007&view=rev
Author: ustramooner
Date: 2009-06-11 16:18:14 +0000 (Thu, 11 Jun 2009)
Log Message:
-----------
dgap bitvectors
Modified Paths:
--------------
branches/lucene2_3_2/src/core/CLucene/util/BitSet.cpp
branches/lucene2_3_2/src/core/CLucene/util/BitSet.h
Modified: branches/lucene2_3_2/src/core/CLucene/util/BitSet.cpp
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/util/BitSet.cpp 2009-05-04 14:46:40 UTC (rev 3006)
+++ branches/lucene2_3_2/src/core/CLucene/util/BitSet.cpp 2009-06-11 16:18:14 UTC (rev 3007)
@@ -13,6 +13,25 @@
CL_NS_USE(store)
CL_NS_DEF(util)
+
+const uint8_t BitSet::BYTE_COUNTS[256] = {
+ 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+ 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
+
BitSet::BitSet( const BitSet& copy ) :
_size( copy._size ),
_count(-1)
@@ -36,11 +55,12 @@
_count=-1;
CL_NS(store)::IndexInput* input = d->openInput( name );
try {
- _size = input->readInt(); // read size
- _count = input->readInt(); // read count
-
- bits = _CL_NEWARRAY(uint8_t,(_size >> 3) + 1); // allocate bits
- input->readBytes(bits, (_size >> 3) + 1); // read bits
+ _size = input->readInt(); // read size
+ if (_size == -1) {
+ readDgaps(input);
+ } else {
+ readBits(input);
+ }
} _CLFINALLY (
input->close();
_CLDELETE(input );
@@ -50,9 +70,11 @@
void BitSet::write(CL_NS(store)::Directory* d, const char* name) {
CL_NS(store)::IndexOutput* output = d->createOutput(name);
try {
- output->writeInt(size()); // write size
- output->writeInt(count()); // write count
- output->writeBytes(bits, (_size >> 3) + 1); // write bits
+ if (isSparse()) {
+ writeDgaps(output); // sparse bit-set more efficiently saved as d-gaps.
+ } else {
+ writeBits(output);
+ }
} _CLFINALLY (
output->close();
_CLDELETE(output);
@@ -62,7 +84,12 @@
_CLDELETE_ARRAY(bits);
}
+
void BitSet::set(const int32_t bit, bool val){
+ if (bit >= _size) {
+ _CLTHROWA(CL_ERR_IndexOutOfBounds, "bit out of range");
+ }
+
_count = -1;
if (val)
@@ -78,23 +105,6 @@
int32_t BitSet::count(){
// if the BitSet has been modified
if (_count == -1) {
- static const uint8_t BYTE_COUNTS[] = {
- 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
- 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
- 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
- 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
- 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
- 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
- 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
- 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
- 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
- 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
int32_t c = 0;
int32_t end = (_size >> 3) + 1;
@@ -108,4 +118,66 @@
return _CLNEW BitSet( *this );
}
+ /** Read as a bit set */
+ void BitSet::readBits(IndexInput* input) {
+ _count = input->readInt(); // read count
+ bits = _CL_NEWARRAY(uint8_t,(_size >> 3) + 1); // allocate bits
+ input->readBytes(bits, (_size >> 3) + 1); // read bits
+ }
+
+ /** read as a d-gaps list */
+ void BitSet::readDgaps(IndexInput* input) {
+ _size = input->readInt(); // (re)read size
+ _count = input->readInt(); // read count
+ bits = _CL_NEWARRAY(uint8_t,(_size >> 3) + 1); // allocate bits
+ int32_t last=0;
+ int32_t n = count();
+ while (n>0) {
+ last += input->readVInt();
+ bits[last] = input->readByte();
+ n -= BYTE_COUNTS[bits[last] & 0xFF];
+ }
+ }
+
+ /** Write as a bit set */
+ void BitSet::writeBits(IndexOutput* output) {
+ output->writeInt(size()); // write size
+ output->writeInt(count()); // write count
+ output->writeBytes(bits, (_size >> 3) + 1); // write bits
+ }
+
+ /** Write as a d-gaps list */
+ void BitSet::writeDgaps(IndexOutput* output) {
+ output->writeInt(-1); // mark using d-gaps
+ output->writeInt(size()); // write size
+ output->writeInt(count()); // write count
+ int32_t last=0;
+ int32_t n = count();
+ int32_t m = (_size >> 3);
+ for (int32_t i=0; i<m && n>0; i++) {
+ if (bits[i]!=0) {
+ output->writeVInt(i-last);
+ output->writeByte(bits[i]);
+ last = i;
+ n -= BYTE_COUNTS[bits[i] & 0xFF];
+ }
+ }
+ }
+
+ /** Indicates if the bit vector is sparse and should be saved as a d-gaps list, or dense, and should be saved as a bit set. */
+ bool BitSet::isSparse() {
+ // note: order of comparisons below set to favor smaller values (no binary range search.)
+ // note: adding 4 because we start with ((int) -1) to indicate d-gaps format.
+ // note: we write the d-gap for the byte number, and the byte (bits[i]) itself, therefore
+ // multiplying count by (8+8) or (8+16) or (8+24) etc.:
+ // - first 8 for writing bits[i] (1 byte vs. 1 bit), and
+ // - second part for writing the byte-number d-gap as vint.
+ // note: factor is for read/write of byte-arrays being faster than vints.
+ int32_t factor = 10;
+ if ((_size >> 3) < (1<< 7)) return factor * (4 + (8+ 8)*count()) < size();
+ if ((_size >> 3) < (1<<14)) return factor * (4 + (8+16)*count()) < size();
+ if ((_size >> 3) < (1<<21)) return factor * (4 + (8+24)*count()) < size();
+ if ((_size >> 3) < (1<<28)) return factor * (4 + (8+32)*count()) < size();
+ return factor * (4 + (8+40)*count()) < size();
+ }
CL_NS_END
Modified: branches/lucene2_3_2/src/core/CLucene/util/BitSet.h
===================================================================
--- branches/lucene2_3_2/src/core/CLucene/util/BitSet.h 2009-05-04 14:46:40 UTC (rev 3006)
+++ branches/lucene2_3_2/src/core/CLucene/util/BitSet.h 2009-06-11 16:18:14 UTC (rev 3007)
@@ -9,13 +9,36 @@
CL_CLASS_DEF(store,Directory)
+CL_CLASS_DEF(store,IndexInput)
+CL_CLASS_DEF(store,IndexOutput)
CL_NS_DEF(util)
+
+
+/** Optimized implementation of a vector of bits. This is more-or-less like
+ java.util.BitSet, but also includes the following:
+ <ul>
+ <li>a count() method, which efficiently computes the number of one bits;</li>
+ <li>optimized read from and write to disk;</li>
+ <li>inlinable get() method;</li>
+ <li>store and load, as bit set or d-gaps, depending on sparseness;</li>
+ </ul>
+ */
class CLUCENE_EXPORT BitSet:LUCENE_BASE {
int32_t _size;
int32_t _count;
uint8_t *bits;
-
+
+ void readBits(CL_NS(store)::IndexInput* input);
+ /** read as a d-gaps list */
+ void readDgaps(CL_NS(store)::IndexInput* input);
+ /** Write as a bit set */
+ void writeBits(CL_NS(store)::IndexOutput* output);
+ /** Write as a d-gaps list */
+ void writeDgaps(CL_NS(store)::IndexOutput* output);
+ /** Indicates if the bit vector is sparse and should be saved as a d-gaps list, or dense, and should be saved as a bit set. */
+ bool isSparse();
+ static const uint8_t BYTE_COUNTS[256];
protected:
BitSet( const BitSet& copy );
@@ -29,7 +52,11 @@
~BitSet();
///get the value of the specified bit
+ ///get the value of the specified bit
inline bool get(const int32_t bit) const{
+ if (bit >= _size) {
+ _CLTHROWA(CL_ERR_IndexOutOfBounds, "bit out of range");
+ }
return (bits[bit >> 3] & (1 << (bit & 7))) != 0;
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|