From: <tho...@us...> - 2011-05-10 19:05:42
Revision: 4477 http://bigdata.svn.sourceforge.net/bigdata/?rev=4477&view=rev Author: thompsonbry Date: 2011-05-10 19:05:34 +0000 (Tue, 10 May 2011) Log Message: ----------- Added HTree support for lookupFirst() and lookupAll(), insert() with split of the bucket (but not yet introduction of a new directory level), and contains() along with basic unit tests for the same. Next steps will be: 1. A split of a bucket page consisting of a single buddy bucket (globalDepth==localDepth). This requires the introduction of a new directory page. 2. Remove. 3. More detailed tests, stress tests, etc. Note that the data structure does not yet support persistence. I am working to get the core logic for the HTree maintence correct and tested first. I can then bring in the additional infrastructure to support incremental writes, checkpoint records, etc. Much of this is already stubbed out, but not yet implemented. Modified Paths: -------------- branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/btree/AbstractBTreeTupleCursor.java branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/btree/AbstractNode.java branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/btree/AbstractTuple.java branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/btree/PO.java branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/htree/AbstractHTree.java branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/htree/HTree.java branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/htree/HTreeUtil.java branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/htree/MutableDirectoryPageData.java branches/QUADS_QUERY_BRANCH/bigdata/src/test/com/bigdata/htree/TestHTree.java Modified: branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/btree/AbstractBTreeTupleCursor.java =================================================================== --- branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/btree/AbstractBTreeTupleCursor.java 2011-05-10 16:25:02 UTC (rev 4476) +++ branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/btree/AbstractBTreeTupleCursor.java 2011-05-10 19:05:34 UTC (rev 4477) @@ -1492,7 +1492,7 @@ } - public Tuple<E> get(Tuple<E> tuple) { + public Tuple<E> get(final Tuple<E> tuple) { relocateLeaf(); Modified: branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/btree/AbstractNode.java =================================================================== --- branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/btree/AbstractNode.java 2011-05-10 16:25:02 UTC (rev 4476) +++ branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/btree/AbstractNode.java 2011-05-10 19:05:34 UTC (rev 4477) @@ -163,7 +163,7 @@ * Return the delegate {@link IAbstractNodeData} object. */ abstract IAbstractNodeData getDelegate(); - + public void delete() { if( deleted ) { @@ -1300,29 +1300,4 @@ */ abstract public boolean dump(Level level, PrintStream out, int height, boolean recursive); - /** - * Returns a string that may be used to indent a dump of the nodes in - * the tree. - * - * @param height - * The height. - * - * @return A string suitable for indent at that height. - */ - protected static String indent(final int height) { - - if( height == -1 ) { - - // The height is not defined. 
- - return ""; - - } - - return ws.substring(0, height * 4); - - } - - private static final transient String ws = " "; - } Modified: branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/btree/AbstractTuple.java =================================================================== --- branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/btree/AbstractTuple.java 2011-05-10 16:25:02 UTC (rev 4476) +++ branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/btree/AbstractTuple.java 2011-05-10 19:05:34 UTC (rev 4477) @@ -31,6 +31,7 @@ import java.nio.ByteBuffer; import java.util.Arrays; +import com.bigdata.btree.data.ILeafData; import com.bigdata.io.ByteArrayBuffer; import com.bigdata.io.DataInputBuffer; import com.bigdata.io.DataOutputBuffer; @@ -341,7 +342,7 @@ * and {@link ITuple#getSourceIndex()} should be implemented by this * class (or maybe add a setSourceIndex() to be more flexible). */ - public void copy(final int index, final Leaf leaf) { + public void copy(final int index, final ILeafData leaf) { nvisited++; @@ -385,8 +386,28 @@ } else { + if(!(leaf instanceof Leaf)) { + + /* + * TODO Raw record support for the HTree will + * require an API shared by Leaf and BucketPage + * which reaches back to the owning index and + * its readRawRecord() method. When doing this, + * it would be best if there were a common base + * or interface for the HTree buckets and BTree + * leaves such that no casts are required, but + * that is not essential. [The argument to the + * method used to be Leaf, not ILeafData, but + * that means that we could not invoke this + * method for a BucketPage.] + */ + + throw new UnsupportedOperationException(); + + } + // materialize from the backing store. - final ByteBuffer tmp = leaf.btree + final ByteBuffer tmp = ((Leaf)leaf).btree .readRawRecord(addr); // and copy into the value buffer. Modified: branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/btree/PO.java =================================================================== --- branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/btree/PO.java 2011-05-10 16:25:02 UTC (rev 4476) +++ branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/btree/PO.java 2011-05-10 19:05:34 UTC (rev 4477) @@ -181,4 +181,30 @@ } } + + /** + * Returns a string that may be used to indent a dump of the nodes in + * the tree. + * + * @param height + * The height. + * + * @return A string suitable for indent at that height. + */ + protected static String indent(final int height) { + + if( height == -1 ) { + + // The height is not defined. 
+ + return ""; + + } + + return ws.substring(0, height * 4); + + } + + private static final transient String ws = " "; + } Modified: branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/htree/AbstractHTree.java =================================================================== --- branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/htree/AbstractHTree.java 2011-05-10 16:25:02 UTC (rev 4476) +++ branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/htree/AbstractHTree.java 2011-05-10 19:05:34 UTC (rev 4477) @@ -1,11 +1,13 @@ package com.bigdata.htree; +import java.io.PrintStream; import java.lang.ref.Reference; import java.lang.ref.SoftReference; import java.lang.ref.WeakReference; import java.nio.ByteBuffer; import java.util.HashMap; +import org.apache.log4j.Level; import org.apache.log4j.Logger; import com.bigdata.Banner; @@ -29,6 +31,7 @@ import com.bigdata.cache.IHardReferenceQueue; import com.bigdata.cache.RingBuffer; import com.bigdata.htree.HTree.AbstractPage; +import com.bigdata.htree.HTree.BucketPage; import com.bigdata.htree.HTree.DirectoryPage; import com.bigdata.rawstore.IRawStore; import com.bigdata.resources.IndexManager; @@ -127,16 +130,16 @@ */ protected final int addressBits; - /** - * The #of entries in a directory bucket, which is 2^{@link #addressBits} - * (aka <code>1<<addressBits</code>). - */ - protected final int branchingFactor; +// /** +// * The #of entries in a directory bucket, which is 2^{@link #addressBits} +// * (aka <code>1<<addressBits</code>). +// */ +// protected final int branchingFactor; /** - * The root node. + * The root directory. */ - protected volatile AbstractPage root; + protected volatile DirectoryPage root; /** * Nodes (that is nodes or leaves) are added to a hard reference queue when @@ -266,7 +269,24 @@ return addressBits; } + /** + * The #of {@link DirectoryPage}s in the {@link HTree} (not buddy hash + * tables, but the pages on which they appear). + */ + abstract public int getNodeCount(); + + /** + * The #of {@link BucketPage}s in the {@link HTree} (not buddy hash buckets, + * but the pages on which they appear). + */ + abstract public int getLeafCount(); + /** + * The #of tuples in the {@link HTree}. + */ + abstract public int getEntryCount(); + + /** * @param store * The persistence store. * @param nodeFactory @@ -317,7 +337,7 @@ this.store = store; this.addressBits = addressBits; - this.branchingFactor = 1 << addressBits; +// this.branchingFactor = 1 << addressBits; // /* // * Compute the minimum #of children/values. This is the same whether @@ -474,6 +494,51 @@ } + /** + * Recursive dump of the tree. + * + * @param out + * The dump is written on this stream. + * + * @return true unless an inconsistency is detected. + */ + public boolean dump(final PrintStream out) { + + return dump(BTree.dumpLog.getEffectiveLevel(), out, false/* materialize */); + + } + + public boolean dump(final Level level, final PrintStream out, + final boolean materialize) { + + // True iff we will write out the node structure. 
+ final boolean info = level.toInt() <= Level.INFO.toInt(); + +// final IBTreeUtilizationReport utils = getUtilization(); + + if (info) { + + out.print("addressBits=" + addressBits); + out.print(", (2^addressBits)=" + (1 << addressBits)); + out.print(", #nodes=" + getNodeCount()); + out.print(", #leaves=" + getLeafCount()); + out.print(", #entries=" + getEntryCount()); + out.println(); +// + ", nodeUtil=" +// + utils.getNodeUtilization() + "%, leafUtil=" +// + utils.getLeafUtilization() + "%, utilization=" +// + utils.getTotalUtilization() + "%" + } + + if (root != null) { + + return root.dump(level, out, 0, true, materialize); + + } else + return true; + + } + /** * <p> * This method is responsible for putting the node or leaf onto the ring @@ -494,7 +559,7 @@ * </p> * <p> * In conjunction with {@link DefaultEvictionListener}, this method - * guarentees that the reference counter for the node will reflect the #of + * guarantees that the reference counter for the node will reflect the #of * times that the node is actually present on the * {@link #writeRetentionQueue}. * </p> Modified: branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/htree/HTree.java =================================================================== --- branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/htree/HTree.java 2011-05-10 16:25:02 UTC (rev 4476) +++ branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/htree/HTree.java 2011-05-10 19:05:34 UTC (rev 4477) @@ -27,20 +27,33 @@ package com.bigdata.htree; +import java.io.PrintStream; import java.lang.ref.Reference; +import java.util.HashSet; +import java.util.NoSuchElementException; +import java.util.Set; +import org.apache.log4j.Level; import org.apache.log4j.Logger; +import com.bigdata.btree.AbstractTuple; import com.bigdata.btree.BTree; import com.bigdata.btree.BytesUtil; -import com.bigdata.btree.ISimpleBTree; +import com.bigdata.btree.IRangeQuery; +import com.bigdata.btree.ITuple; +import com.bigdata.btree.ITupleIterator; +import com.bigdata.btree.ITupleSerializer; import com.bigdata.btree.Leaf; +import com.bigdata.btree.LeafTupleIterator; import com.bigdata.btree.MutableLeafData; import com.bigdata.btree.Node; import com.bigdata.btree.PO; +import com.bigdata.btree.data.DefaultLeafCoder; import com.bigdata.btree.data.IAbstractNodeData; import com.bigdata.btree.data.ILeafData; import com.bigdata.btree.raba.IRaba; +import com.bigdata.btree.raba.MutableKeyBuffer; +import com.bigdata.btree.raba.MutableValueBuffer; import com.bigdata.cache.HardReferenceQueue; import com.bigdata.htree.data.IDirectoryData; import com.bigdata.io.AbstractFixedByteArrayBuffer; @@ -64,9 +77,9 @@ * int32 keys. */ public class HTree extends AbstractHTree - implements +// implements // IIndex, - ISimpleBTree//, IAutoboxBTree, ILinearList, IBTreeStatistics, ILocalBTreeView +// ISimpleBTree//, IAutoboxBTree, ILinearList, IBTreeStatistics, ILocalBTreeView { private static final transient Logger log = Logger.getLogger(HTree.class); @@ -80,20 +93,54 @@ private final boolean deleteMarkers; private final boolean rawRecords; + /** + * The #of {@link DirectoryPage} in the {@link HTree}. This is ONE (1) for a + * new {@link HTree}. + */ + protected int nnodes; + + /** + * The #of {@link BucketPage}s in the {@link HTree}. This is one (1) for a + * new {@link HTree} (one directory page and one bucket page). + */ + protected int nleaves; + + /** + * The #of entries in the {@link HTree}. This is ZERO (0) for a new + * {@link HTree}. 
+ */ + protected int nentries; + + final public int getNodeCount() { + + return nnodes; + + } + + final public int getLeafCount() { + + return nleaves; + + } + + final public int getEntryCount() { + + return nentries; + + } + public boolean isReadOnly() { return false; // TODO set by ctor. } - /** - * The root of the btree. This is initially a leaf until the leaf is split, - * at which point it is replaced by a node. The root is also replaced each - * time copy-on-write triggers a cascade of updates. - * <p> - * The hard reference to the root node is cleared if the index is - * {@link #close() closed}. This method automatically {@link #reopen()}s - * the index if it is closed, making it available for use. - */ - final protected AbstractPage getRoot() { + /** + * The root of the {@link HTree}. This is always a {@link DirectoryPage}. + * <p> + * The hard reference to the root node is cleared if the index is + * {@link #close() closed}. This method automatically {@link #reopen()}s the + * index if it is closed, making it available for use. + */ + final protected DirectoryPage getRoot() { // make sure that the root is defined. if (root == null) @@ -169,6 +216,7 @@ this,// the owning htree instance addressBits // the global depth of the root. ); + nnodes++; assert r.getSlotsPerBuddy() == 1; assert r.getNumBuddies() == (1 << addressBits); @@ -177,13 +225,17 @@ // Initial bucket. final BucketPage b = new BucketPage(this, 0/* globalDepth */); + nleaves++; final int nslots = r.getSlotsPerBuddy() * r.getNumBuddies(); for (int i = 0; i < nslots; i++) { - r.refs[i] = (Reference)b.self; - rdata.childAddr[i] = 0L; // TODO addr(b) plus ref of [b]. + b.parent = (Reference) r.self; // TODO who has responsibility to set the parent reference? + + r.childRefs[i] = (Reference) b.self; + + rdata.childAddr[i] = 0L; } @@ -204,49 +256,562 @@ * * TODO The hash tree will normally be used with int32 keys, and those keys * will typically be obtained from Object#hashCode(). Special methods should - * be provided for this purpose. E.g., ISimpleHTree. Custom serialization + * be provided for this purpose. E.g., ISimpleHTree. Custom serialization * and even a custom mutable leaf data record should be used with this case. + * + * TODO insert() return for htree is never old value since always adds() a + * new tuple never replaces() an existing tuple. Further, if the key and + * val are equals() then we might even define the semantics as a NOP rather + * than appending a duplicate item into the bucket (it's worth thinking on + * this further as scanning for equals is more work and sometimes we might + * want to allow fully duplicated items into the bucket.). */ - public boolean contains(byte[] key) { - // TODO Auto-generated method stub - return false; - } - - public void insert(final Object obj) { - insert(obj.hashCode(), SerializerUtil.serialize(obj)); + /** + * Convert an int32 hash code key into an <code>unsigned byte[4]</code>. + */ + private byte[] i2k(final int key) { + final byte[] buf = new byte[4]; + buf[0] = (byte) (key >>> 24); + buf[1] = (byte) (key >>> 16); + buf[2] = (byte) (key >>> 8); + buf[3] = (byte) (key >>> 0); + return buf; } - public void insert(final int key, final byte[] val) { - // TODO code path for native int32 keys. + // TODO contains with an Object clear needs to test for the Object, not just + // the key. Maybe do this as a wrapper similar to BigdataMap? 
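/*
 * For illustration, i2k() above is simply the big-endian encoding of the
 * int32 hash code as an unsigned byte[4]. A hedged sanity check
 * (hypothetical test code, not part of this commit):
 *
 *   final byte[] k = i2k(0x12345678);
 *   assert k[0] == 0x12 && k[1] == 0x34 && k[2] == 0x56 && k[3] == 0x78;
 *
 * This is equivalent to ByteBuffer.allocate(4).putInt(key).array(), which
 * also writes big-endian by default.
 */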
+ public boolean contains(final Object obj) { + //contains(obj.hashCode()); throw new UnsupportedOperationException(); } - public byte[] insert(byte[] key, byte[] value) { - // TODO Auto-generated method stub - return null; + public boolean contains(final int key) { + return contains(i2k(key)); } - public void lookup(final Object obj) { - lookup(obj.hashCode()); + /** + * Return <code>true</code> iff there is at least one tuple in the hash tree + * having the specified <i>key</i>. + * + * @param key + * The key. + * @return <code>true</code> iff the hash tree contains at least one tuple + * with that <i>key</i>. + * @throws IllegalArgumentException + * if the <i>key</i> is <code>null</code>. + * + * TODO Parallel to insert(), consider a contains() signature + * which permits testing for a specific value as well. Maybe + * in a wrapper class? + */ + public boolean contains(final byte[] key) { + if (key == null) + throw new IllegalArgumentException(); + DirectoryPage current = getRoot(); // start at the root. + int prefixLength = 0;// prefix length of the root is always zero. + int buddyOffset = 0; // buddyOffset of the root is always zero. + while (true) { + // skip prefixLength bits and then extract globalDepth bits. + final int hashBits = current.getLocalHashCode(key, prefixLength); + // find the child directory page or bucket page. + final AbstractPage child = current.getChild(hashBits, buddyOffset); + if (child.isLeaf()) { + /* + * Found the bucket page, update it. + */ + final BucketPage bucketPage = (BucketPage) child; + if(!bucketPage.contains(key, buddyOffset)) { + return false; + } + return true; + } + /* + * Recursive descent into a child directory page. We have to update + * the prefixLength and compute the offset of the buddy hash table + * within the child before descending into the child. + */ + prefixLength = prefixLength + child.globalDepth; + buddyOffset = HTreeUtil + .getBuddyOffset(hashBits, current.globalDepth, + child.globalDepth/* localDepthOfChild */); + current = (DirectoryPage) child; + } + } + +// public void lookup(final Object obj) { +// lookup(obj.hashCode()); +// } + + /** + * Return the first value for the key. + * + * @param key + * The key. + * + * @return The first value for the key -or- <code>null</code> if there are + * no tuples in the index having that key. Note that the return + * value is not diagnostic if the application allows + * <code>null</code> values into the index. + */ + public byte[] lookupFirst(final int key) { + return lookupFirst(i2k(key)); } - public void lookup(final int key) { - // TODO code path for native int32 keys. - throw new UnsupportedOperationException(); + /** + * Return the first value for the key. + * + * @param key + * The key. + * + * @return The first value for the key -or- <code>null</code> if there are + * no tuples in the index having that key. Note that the return + * value is not diagnostic if the application allows + * <code>null</code> values into the index. + */ + public byte[] lookupFirst(final byte[] key) { + if (key == null) + throw new IllegalArgumentException(); + DirectoryPage current = getRoot(); // start at the root. + int prefixLength = 0;// prefix length of the root is always zero. + int buddyOffset = 0; // buddyOffset of the root is always zero. + while (true) { + // skip prefixLength bits and then extract globalDepth bits. + final int hashBits = current.getLocalHashCode(key, prefixLength); + // find the child directory page or bucket page. 
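/*
 * Bookkeeping for the descent (shared by contains(), lookupFirst(),
 * lookupAll() and insert()); the numbers below are illustrative only and
 * assume addressBits = 2. With key[0] = 1011 0100 (binary):
 *
 *   root  (globalDepth = 2): getBits(key, 0, 2) == 0b10 == 2
 *   child (globalDepth = 2): getBits(key, 2, 2) == 0b11 == 3
 *
 * i.e. each directory level consumes globalDepth bits of the key
 * (prefixLength += child.globalDepth) and getBuddyOffset() then selects
 * the buddy hash table within that child.
 */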
+ final AbstractPage child = current.getChild(hashBits, buddyOffset); + if (child.isLeaf()) { + /* + * Found the bucket page, search it for a match. + */ + final BucketPage bucketPage = (BucketPage) child; + return bucketPage.lookupFirst(key, buddyOffset); + } + /* + * Recursive descent into a child directory page. We have to update + * the prefixLength and compute the offset of the buddy hash table + * within the child before descending into the child. + */ + prefixLength = prefixLength + child.globalDepth; + buddyOffset = HTreeUtil + .getBuddyOffset(hashBits, current.globalDepth, + child.globalDepth/* localDepthOfChild */); + current = (DirectoryPage) child; + } + } + + /** + * Return an iterator which will visit each tuple in the index having the + * specified key. + * + * @param key + * The key. + * + * @return The iterator and never <code>null</code>. + */ + public ITupleIterator lookupAll(final int key) { + return lookupAll(i2k(key)); + } + + /** + * Return an iterator which will visit each tuple in the index having the + * specified key. + * + * @param key + * The key. + * + * @return The iterator and never <code>null</code>. + */ + public ITupleIterator lookupAll(final byte[] key) { + if (key == null) + throw new IllegalArgumentException(); + DirectoryPage current = getRoot(); // start at the root. + int prefixLength = 0;// prefix length of the root is always zero. + int buddyOffset = 0; // buddyOffset of the root is always zero. + while (true) { + // skip prefixLength bits and then extract globalDepth bits. + final int hashBits = current.getLocalHashCode(key, prefixLength); + // find the child directory page or bucket page. + final AbstractPage child = current.getChild(hashBits, buddyOffset); + if (child.isLeaf()) { + /* + * Found the bucket page, search it for a match. + */ + final BucketPage bucketPage = (BucketPage) child; + return bucketPage.lookupAll(key, buddyOffset); + } + /* + * Recursive descent into a child directory page. We have to update + * the prefixLength and compute the offset of the buddy hash table + * within the child before descending into the child. + */ + prefixLength = prefixLength + child.globalDepth; + buddyOffset = HTreeUtil + .getBuddyOffset(hashBits, current.globalDepth, + child.globalDepth/* localDepthOfChild */); + current = (DirectoryPage) child; + } + } + + public void insert(final Object obj) { + insert(obj.hashCode(), SerializerUtil.serialize(obj)); } - public byte[] lookup(byte[] key) { - // TODO Return 1st match or null iff not found. - return null; + public void insert(final int key, final byte[] val) { + insert(i2k(key), val); } - public byte[] remove(byte[] key) { + /** + * Insert a tuple into the hash tree. Tuples with duplicate keys and even + * tuples with duplicate keys and values are allowed and will result in + * multiple tuples. + * + * @param key + * The key. + * @param value + * The value. + * @return <code>null</code> (always). + * + * TODO If the application wants to restrict the hash tree to such + * that it does not contain duplicate tuples then it must first + * search in the tree for an exact match (key and value). It is + * easier to do that from within the insert logic so expand the + * method signature to pass an insert enum {ALLDUPS,DUPKEYS,NODUPS}. + */ + public byte[] insert(final byte[] key, final byte[] value) { + if (key == null) + throw new IllegalArgumentException(); + DirectoryPage current = getRoot(); // start at the root. + int prefixLength = 0;// prefix length of the root is always zero. 
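/*
 * Hedged usage sketch (hypothetical caller, not part of this commit):
 * because insert() always appends and never replaces, duplicate keys
 * accumulate and are recovered with lookupAll():
 *
 *   htree.insert(12, SerializerUtil.serialize("a"));
 *   htree.insert(12, SerializerUtil.serialize("b"));
 *   final ITupleIterator itr = htree.lookupAll(12);
 *   while (itr.hasNext()) {
 *       itr.next(); // visits both tuples for key 12.
 *   }
 */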
+ int buddyOffset = 0; // buddyOffset of the root is always zero. + while (true) { + // skip prefixLength bits and then extract globalDepth bits. + final int hashBits = current.getLocalHashCode(key, prefixLength); + // find the child directory page or bucket page. + final AbstractPage child = current.getChild(hashBits, buddyOffset); + if (child.isLeaf()) { + /* + * Found the bucket page, update it. + */ + final BucketPage bucketPage = (BucketPage) child; + // attempt to insert the tuple into the bucket. + if(!bucketPage.insert(key, value, current, buddyOffset)) { + + // TODO if(parent.isReadOnly()) parent = copyOnWrite(); + + if (current.globalDepth == child.globalDepth) { + + /* + * There is only one buddy hash bucket on the page. To + * split the page, we have to introduce a new directory + * page above it. + * + * TODO Introduce new directory page if sole buddy + * bucket is full. + */ + + throw new UnsupportedOperationException(); + + } + + // globalDepth >= localDepth + splitBucketsOnPage(current, buddyOffset, bucketPage); + + // Try again. The children have changed. + continue; + } + return null; // TODO should be Void return? or depends on enum controlling dups behavior? + } + /* + * Recursive descent into a child directory page. We have to update + * the prefixLength and compute the offset of the buddy hash table + * within the child before descending into the child. + */ + prefixLength = prefixLength + child.globalDepth; + buddyOffset = HTreeUtil + .getBuddyOffset(hashBits, current.globalDepth, + child.globalDepth/* localDepthOfChild */); + current = (DirectoryPage) child; + } + } + + public byte[] remove(final byte[] key) { // TODO Remove 1st match, returning value. - return null; + throw new UnsupportedOperationException(); } /** + * Handle split if buddy bucket is full but localDepth LT globalDepth (so + * there is more than one buddy bucket on the page). This will allocate a + * new bucket page; update the references in the parent, and then + * redistribute buddy buckets among the old and new bucket page. The depth + * of the child will be increased by one. As a post-condition, the depth of + * the new child will be the same as the then current depth of the original + * child. Note that this doubles the size of each buddy bucket, thus always + * creating room for additional tuples. + * + * @param parent + * The parent {@link DirectoryPage}. + * @param buddyOffset + * The buddyOffset within the <i>parent</i>. This identifies + * which buddy hash table in the parent must be its pointers + * updated such that it points to both the original child and new + * child. + * @param oldBucket + * The child {@link BucketPage}. + * + * @throws IllegalArgumentException + * if any argument is <code>null</code>. + * @throws IllegalStateException + * if the depth of the child is GTE the depth of the parent. + * @throws IllegalStateException + * if the <i>parent<i/> is read-only. + * @throws IllegalStateException + * if the <i>oldBucket</i> is read-only. + * @throws IllegalStateException + * if the parent of the <oldBucket</i> is not the given + * <i>parent</i>. + */ + private void splitBucketsOnPage(final DirectoryPage parent, + final int buddyOffset, final BucketPage oldBucket) { + + if (parent == null) + throw new IllegalArgumentException(); + if (oldBucket == null) + throw new IllegalArgumentException(); + if (oldBucket.globalDepth >= parent.globalDepth) { + // In this case we have to introduce a new directory page instead. 
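/*
 * Illustrative numbers (assuming addressBits = 2): a parent with
 * globalDepth = 2 addresses a child at localDepth = 1 from 2^(2-1) = 2
 * slots of its buddy hash table, so the child can be split on this code
 * path. Once localDepth == globalDepth only 2^0 = 1 slot points at the
 * child and a split must instead introduce a new directory level, hence
 * the exception below.
 */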
+ throw new IllegalStateException(); + } + if (buddyOffset < 0) + throw new IllegalArgumentException(); + if (buddyOffset >= (1 << addressBits)) { + /* + * Note: This check is against the maximum possible slot index. The + * actual max buddyOffset depends on parent.globalBits also since + * (1<<parent.globalBits) gives the #of slots per buddy and the + * allowable buddyOffset values must fall on an buddy hash table + * boundary. + */ + throw new IllegalArgumentException(); + } + if(parent.isReadOnly()) // must be mutable. + throw new IllegalStateException(); + if(oldBucket.isReadOnly()) // must be mutable. + throw new IllegalStateException(); + if (oldBucket.parent != parent.self) // must be same Reference. + throw new IllegalStateException(); + + final int oldDepth = oldBucket.globalDepth; + final int newDepth = oldDepth + 1; + + // Allocate a new bucket page (globalDepth is increased by one). + final BucketPage newBucket = new BucketPage(this, oldBucket.globalDepth + 1); + + assert newBucket.isDirty(); + + // Set the parent reference on the new bucket. + newBucket.parent = (Reference) parent.self; + + // Increase global depth on the old page also. + oldBucket.globalDepth++; + + nleaves++; // One more bucket page in the hash tree. + + // #of slots on the bucket page (invariant given addressBits). + final int slotsOnPage = (1 << addressBits); + + /* + * Update pointers in buddy hash table in the parent. + * + * Note: The upper 1/2 of the pointers in the buddy hash table in the + * parent are rewritten to point to the new child. + */ + { + + // #of address slots in the parent buddy hash table. + final int slotsPerBuddy = (1 << parent.globalDepth); + + // Must be at least two slots per buddy for us to redistribute + // pointers. + assert slotsPerBuddy > 1 : "slotsPerBuddy=" + slotsPerBuddy; + + // The first slot in the upper 1/2 of the buddy hash table. + final int firstSlot = buddyOffset + (slotsPerBuddy >> 1); + + // The last slot in the buddy hash table. + final int lastSlot = buddyOffset + slotsPerBuddy; + + for (int i = firstSlot; i < lastSlot; i++) { + + // update the references to the new bucket. + parent.childRefs[i] = (Reference) newBucket.self; + + // clear address since newBucket is dirty. + ((MutableDirectoryPageData) parent.data).childAddr[i] = 0L; + + } + + } + + /* + * Redistribute the buddy buckets. + * + * Note: We are not changing the #of buddy buckets, just their size and + * the page on which they are found. Any tuples in a source bucket will + * wind up in the same bucket afterwards, but the page and offset on the + * page of the buddy bucket may have been changed. + * + * TODO So we could proceed backwards, moving the upper half of the + * buddy buckets to the new bucket page first and then spreading out the + * lower half of the source page among the new bucket boundaries on the + * page. Again, we have to move backwards through the buddy buckets on + * the source page to avoid overwrites of data which has not yet been + * copied. + */ + { + + // #of address slots in each old buddy hash bucket. + final int slotsPerOldBuddy = (1 << oldDepth); + + // #of address slots in each new buddy hash bucket. + final int slotsPerNewBuddy = (1 << newDepth); + + // #of buddy tables on the old bucket page. + final int oldBuddyCount = (slotsOnPage) / slotsPerOldBuddy; + + // #of buddy tables on the bucket pages after the split. 
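/*
 * Worked numbers for this split (illustrative, assuming addressBits = 2,
 * oldDepth = 1): slotsOnPage = 4, slotsPerOldBuddy = 2 and
 * slotsPerNewBuddy = 4, giving oldBuddyCount = 2 and newBuddyCount = 1.
 * In the parent (globalDepth = 2, buddyOffset = 0) the pointer update
 * above rewrote firstSlot = 0 + (4 >> 1) = 2 through lastSlot = 4, so
 * slots 2-3 now address newBucket while slots 0-1 keep oldBucket.
 */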
+ final int newBuddyCount = (slotsOnPage) / slotsPerNewBuddy; + + final BucketPage srcPage = oldBucket; + final MutableKeyBuffer srcKeys = (MutableKeyBuffer)oldBucket.getKeys(); + final MutableValueBuffer srcVals = (MutableValueBuffer)oldBucket.getValues(); + + /* + * Move top 1/2 of the buddy buckets from the child to the new page. + */ + { + + // target is the new page. + final BucketPage dstPage = newBucket; + final MutableKeyBuffer dstKeys = (MutableKeyBuffer)dstPage.getKeys(); + final MutableValueBuffer dstVals = (MutableValueBuffer)dstPage.getValues(); + + // index (vs offset) of first buddy in upper half of src page. + final int firstSrcBuddyIndex = (oldBuddyCount >> 1); + + // exclusive upper bound for index (vs offset) of last buddy in + // upper half of src page. + final int lastSrcBuddyIndex = oldBuddyCount; + + // exclusive upper bound for index (vs offset) of last buddy in + // upper half of target page. + final int lastDstBuddyIndex = newBuddyCount; + + // work backwards over buddy buckets to avoid stomping data! + for (int srcBuddyIndex = lastSrcBuddyIndex - 1, dstBuddyIndex = lastDstBuddyIndex - 1; // + srcBuddyIndex >= firstSrcBuddyIndex; // + srcBuddyIndex--, dstBuddyIndex--// + ) { + + final int firstSrcSlot = srcBuddyIndex * slotsPerOldBuddy; + + final int lastSrcSlot = (srcBuddyIndex + 1) + * slotsPerOldBuddy; + + final int firstDstSlot = dstBuddyIndex * slotsPerNewBuddy; + + for (int srcSlot = firstSrcSlot, dstSlot = firstDstSlot; srcSlot < lastSrcSlot; srcSlot++, dstSlot++) { + + if (log.isTraceEnabled()) + log.trace("moving: page(" + srcPage.toShortString() + + "=>" + dstPage.toShortString() + ")" + + ", buddyIndex(" + srcBuddyIndex + "=>" + + dstBuddyIndex + ")" + ", slot(" + srcSlot + + "=>" + dstSlot + ")"); + + // Move the tuple at that slot TODO move metadata also. + dstKeys.keys[dstSlot] = srcKeys.keys[srcSlot]; + dstVals.values[dstSlot] = srcVals.values[srcSlot]; + srcKeys.keys[srcSlot] = null; + srcVals.values[srcSlot] = null; + + } + + } + + } + + /* + * Reposition the bottom 1/2 of the buddy buckets on the old page. + */ + { + + // target is the old page. + final BucketPage dstPage = oldBucket; + final MutableKeyBuffer dstKeys = (MutableKeyBuffer)dstPage.getKeys(); + final MutableValueBuffer dstVals = (MutableValueBuffer)dstPage.getValues(); + + // index (vs offset) of first buddy in lower half of src page. + final int firstSrcBuddyIndex = 0; + + // exclusive upper bound for index (vs offset) of last buddy in + // lower half of src page. + final int lastSrcBuddyIndex = (oldBuddyCount >> 1); + + // exclusive upper bound for index (vs offset) of last buddy in + // upper half of target page (which is also the source page). + final int lastDstBuddyIndex = newBuddyCount; + + /* + * Work backwards over buddy buckets to avoid stomping data! + * + * Note: The slots for first buddy in the lower half of the + * source page DO NOT MOVE. The offset of that buddy in the + * first page remains unchanged. Only the size of the buddy is + * changed (it is doubled, just like all the other buddies on + * the page). 
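 *
 * Continuing the illustrative numbers above (addressBits = 2, oldDepth =
 * 1, newDepth = 2): oldBuddyCount = 2 and newBuddyCount = 1, so the upper
 * buddy (source slots 2-3) has already moved to slots 0-1 of the new
 * page, and the loop below makes zero passes (srcBuddyIndex starts at 0,
 * which is not GT firstSrcBuddyIndex); buddy 0 keeps slots 0-1 in place
 * and simply doubles its capacity to the full page.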
+ */ + for (int srcBuddyIndex = lastSrcBuddyIndex - 1, dstBuddyIndex = lastDstBuddyIndex - 1; // + srcBuddyIndex > firstSrcBuddyIndex; // + srcBuddyIndex--, dstBuddyIndex--// + ) { + + final int firstSrcSlot = srcBuddyIndex * slotsPerOldBuddy; + + final int lastSrcSlot = (srcBuddyIndex + 1) + * slotsPerOldBuddy; + + final int firstDstSlot = dstBuddyIndex * slotsPerNewBuddy; + + for (int srcSlot = firstSrcSlot, dstSlot = firstDstSlot; srcSlot < lastSrcSlot; srcSlot++, dstSlot++) { + + if (log.isTraceEnabled()) + log.trace("moving: page(" + srcPage.toShortString() + + "=>" + dstPage.toShortString() + ")" + + ", buddyIndex(" + srcBuddyIndex + "=>" + + dstBuddyIndex + ")" + ", slot(" + srcSlot + + "=>" + dstSlot + ")"); + + // Move the tuple at that slot TODO move metadata also. + dstKeys.keys[dstSlot] = srcKeys.keys[srcSlot]; + dstVals.values[dstSlot] = srcVals.values[srcSlot]; + srcKeys.keys[srcSlot] = null; + srcVals.values[srcSlot] = null; + + } + + } + + } + + } + + // TODO assert invariants? + + } + + /** * Persistence capable abstract base class for HTree pages. * * @author thompsonbry @@ -377,8 +942,9 @@ /** * Return the bits from the key which are relevant to the current - * directory page. This depends on the <i>prefixLength</i> to be - * ignored, the <i>globalDepth</i> of this directory page, and the key. + * directory page (varient for unsigned byte[] keys). This depends on + * the <i>prefixLength</i> to be ignored, the <i>globalDepth</i> of this + * directory page, and the key. * * @param key * The key. @@ -387,17 +953,41 @@ * this level of the hash tree. This is computed dynamically * during recursive traversal of the hash tree. This is ZERO * (0) for the root directory. It is incremented by - * <i>globalDepth</i> at each level of recursion for insert, - * lookup, etc. + * <i>globalDepth</i> (the #of address bits used by a given + * node) at each level of recursion for insert, lookup, etc. * * @return The int32 value containing the relevant bits from the key. */ public int getLocalHashCode(final byte[] key, final int prefixLength) { - + return BytesUtil.getBits(key, prefixLength, globalDepth); } + /** + * Return the bits from the key which are relevant to the current + * directory page (variant for int32 keys). This depends on the + * <i>prefixLength</i> to be ignored, the <i>globalDepth</i> of this + * directory page, and the key. + * + * @param key + * The key. + * @param prefixLength + * The #of MSB bits in the key which are to be ignored at + * this level of the hash tree. This is computed dynamically + * during recursive traversal of the hash tree. This is ZERO + * (0) for the root directory. It is incremented by + * <i>globalDepth</i> (the #of address bits used by a given + * node) at each level of recursion for insert, lookup, etc. + * + * @return The int32 value containing the relevant bits from the key. + */ + public int getLocalHashCode(final int key, final int prefixLength) { + + return BytesUtil.getBits(key, prefixLength, globalDepth); + + } + /** * Disallowed. */ @@ -517,6 +1107,29 @@ } + /** + * Dump the data onto the {@link PrintStream}. + * + * @param level + * The logging level. + * @param out + * Where to write the dump. + * @param height + * The height of this node in the tree or -1 iff you need to + * invoke this method on a node or leaf whose height in the + * tree is not known. + * @param recursive + * When true, the node will be dumped recursively using a + * pre-order traversal. 
+ * @param materialize + * When <code>true</code>, children will be materialized as + * necessary to dump the tree. + * + * @return <code>true</code> unless an inconsistency was detected. + */ + abstract protected boolean dump(Level level, PrintStream out, + int height, boolean recursive, boolean materialize); + } // class AbstractPage /** @@ -563,10 +1176,20 @@ * reaches this condition of containing only duplicate keys we could of * course compress the keys enormously since they are all the same. * <p> - * While the #of tuples in a buddy is governed by the global depth of the - * bucket, the #of tuples in a bucket page consisting of a single buddy - * bucket might be governed by the target page size. (Perhaps we could even - * split buddy buckets if the page size is exceeded?) + * This works well when we only store the address of the objects in the + * bucket (rather than the objects themselves, e.g., raw records mode) and + * choose the address bits based on the expected size of a tuple record. + * However, we can also accommodate tuples with varying size (think binding + * sets) with in page locality if split the buddy bucket with the most bytes + * when an insert into the page would exceed the target page size. This + * looks pretty much like the case above, except that we split buddy buckets + * based on not only whether their alloted #of slots are filled by tuples + * but also based on the data on the page. + * + * TODO Delete markers will also require some thought. Unless we can purge + * them out at the tx commit point, we can wind up with a full bucket page + * consisting of a single buddy bucket filled with deleted tuples all having + * the same key. */ static class BucketPage extends AbstractPage implements ILeafData { // TODO IBucketData @@ -681,71 +1304,399 @@ super(htree, true/* dirty */, globalDepth); - data = new MutableLeafData(htree.branchingFactor, - htree.versionTimestamps, htree.deleteMarkers, - htree.rawRecords); + // TODO keysRaba must allow nulls! + data = new MutableLeafData(// + (1<<htree.addressBits), // fan-out + htree.versionTimestamps,// + htree.deleteMarkers,// + htree.rawRecords// + ); // new MutableBucketData(data) } - } // class BucketPage + /** + * Return <code>true</code> if there is at lease one tuple in the buddy + * hash bucket for the specified key. + * + * @param key + * The key. + * @param buddyOffset + * The offset within the {@link BucketPage} of the buddy hash + * bucket to be searched. + * + * @return <code>true</code> if a tuple is found in the buddy hash + * bucket for the specified key. + */ + boolean contains(final byte[] key, final int buddyOffset) { - /** - * An {@link HTree} directory page (node). Each directory page will hold one - * or more "buddy" hash tables. The #of buddy hash tables on the page - * depends on the globalDepth of the page and the addressBits as computed by - * {@link DirectoryPage#getNumBuddies()}. - */ - static class DirectoryPage extends AbstractPage implements IDirectoryData { + if (key == null) + throw new IllegalArgumentException(); - /** - * Transient references to the children. - */ - final Reference<AbstractPage>[] refs; - + // #of slots on the page. + final int slotsOnPage = (1 << htree.addressBits); + + // #of address slots in each buddy hash table. + final int slotsPerBuddy = (1 << globalDepth); + +// // #of buddy tables on a page. +// final int nbuddies = (slotsOnPage) / slotsPerBuddy; + + final int lastSlot = buddyOffset + slotsPerBuddy; + + // range check buddyOffset. 
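/*
 * Slot arithmetic (illustrative, assuming addressBits = 4 and globalDepth
 * = 2): slotsOnPage = 1 << 4 = 16 and slotsPerBuddy = 1 << 2 = 4, so the
 * page holds 16 / 4 = 4 buddy buckets, the buddy boundaries are at
 * buddyOffset in {0, 4, 8, 12}, and the scan below probes the half-open
 * slot range [buddyOffset, buddyOffset + slotsPerBuddy).
 */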
+ if (buddyOffset < 0 || buddyOffset >= slotsOnPage) + throw new IndexOutOfBoundsException(); + + /* + * Locate the first unassigned tuple in the buddy bucket. + * + * TODO Faster comparison with a coded key in the raba by either (a) + * asking the raba to do the equals() test; or (b) copying the key + * from the raba into a buffer which we reuse for each test. This is + * another way in which the hash table keys raba differs from the + * btree keys raba. + */ + final IRaba keys = getKeys(); + for (int i = buddyOffset; i < lastSlot; i++) { + if (!keys.isNull(i)) { + if(BytesUtil.bytesEqual(key,keys.get(i))) { + return true; + } + } + } + return false; + } + /** - * Persistent data. + * Return the first value found in the buddy hash bucket for the + * specified key. + * + * @param key + * The key. + * @param buddyOffset + * The offset within the {@link BucketPage} of the buddy hash + * bucket to be searched. + * + * @return The value associated with the first tuple found in the buddy + * hash bucket for the specified key and <code>null</code> if no + * such tuple was found. Note that the return value is not + * diagnostic if the application allows <code>null</code> values + * into the index. */ - IDirectoryData data; + final byte[] lookupFirst(final byte[] key, final int buddyOffset) { + if (key == null) + throw new IllegalArgumentException(); + + // #of slots on the page. + final int slotsOnPage = (1 << htree.addressBits); + + // #of address slots in each buddy hash table. + final int slotsPerBuddy = (1 << globalDepth); + +// // #of buddy tables on a page. +// final int nbuddies = (slotsOnPage) / slotsPerBuddy; + + final int lastSlot = buddyOffset + slotsPerBuddy; + + // range check buddyOffset. + if (buddyOffset < 0 || buddyOffset >= slotsOnPage) + throw new IndexOutOfBoundsException(); + + /* + * Locate the first unassigned tuple in the buddy bucket. + * + * TODO Faster comparison with a coded key in the raba by either (a) + * asking the raba to do the equals() test; or (b) copying the key + * from the raba into a buffer which we reuse for each test. This is + * another way in which the hash table keys raba differs from the + * btree keys raba. + */ + final IRaba keys = getKeys(); + for (int i = buddyOffset; i < lastSlot; i++) { + if (!keys.isNull(i)) { + if(BytesUtil.bytesEqual(key,keys.get(i))) { + return getValues().get(i); + } + } + } + return null; + } + /** - * Get the child indexed by the key. - * <p> - * Note: The recursive descent pattern requires the caller to separately - * compute the buddyOffset before each descent into a child. + * Return an iterator which will visit each tuple in the buddy hash + * bucket for the specified key. * * @param key * The key. - * @param prefixLength - * The #of MSB bits in the key which are to be ignored at - * this level of the hash tree. This is computed dynamically - * during recursive traversal of the hash tree. This is ZERO - * (0) for the root directory. It is incremented by - * <i>globalDepth</i> at each level of recursion for insert, - * lookup, etc. * @param buddyOffset + * The offset within the {@link BucketPage} of the buddy hash + * bucket to be searched. + * + * @return An iterator which will visit each tuple in the buddy hash + * table for the specified key and never <code>null</code>. + * + * TODO Specify the contract for concurrent modification both + * here and on the {@link HTree#lookupAll(byte[])} methods. 
+ */ + final ITupleIterator lookupAll(final byte[] key, final int buddyOffset) { + + return new BuddyBucketTupleIterator(key, this, buddyOffset); + + } + + /** + * Insert the tuple into the buddy bucket. + * + * @param key + * The key (all bits, all bytes). + * @param val + * The value (optional). + * @param parent + * The parent {@link DirectoryPage} and never + * <code>null</code> (this is required for the copy-on-write + * pattern). + * @param buddyOffset * The offset into the child of the first slot for the buddy * hash table or buddy hash bucket. * - * @return The child indexed by the key. + * @return <code>false</code> iff the buddy bucket must be split. * - * @see HTreeUtil#getBuddyOffset(int, int, int) - * - * TODO This method signature would require us to invoke - * {@link #getLocalHashCode(byte[], int)} twice (once here and once - * in support of obtaining the buddyOffset). That suggests that the - * caller should invoke the method once and then use the - * int32:hashBits variant and we would then drop this version of - * the method. + * @throws IllegalArgumentException + * if <i>key</i> is <code>null</code>. + * @throws IllegalArgumentException + * if <i>parent</i> is <code>null</code>. + * @throws IndexOutOfBoundsException + * if <i>buddyOffset</i> is out of the allowed range. */ - protected AbstractPage getChild(final byte[] key, - final int prefixLength, final int buddyOffset) { + boolean insert(final byte[] key, final byte[] val, + final DirectoryPage parent, + final int buddyOffset) { - return getChild(getLocalHashCode(key, prefixLength), buddyOffset); + if (key == null) + throw new IllegalArgumentException(); + if (parent == null) + throw new IllegalArgumentException(); + + // #of slots on the page. + final int slotsOnPage = (1 << htree.addressBits); + + // #of address slots in each buddy hash table. + final int slotsPerBuddy = (1 << globalDepth); + + // #of buddy tables on a page. + final int nbuddies = (slotsOnPage) / slotsPerBuddy; + + final int lastSlot = buddyOffset + slotsPerBuddy; + + // range check buddyOffset. + if (buddyOffset < 0 || buddyOffset >= slotsOnPage) + throw new IndexOutOfBoundsException(); + + // TODO if(!mutable) copyOnWrite().insert(key,val,parent,buddyOffset); + + /* + * Locate the first unassigned tuple in the buddy bucket. + * + * Note: Given the IRaba data structure, this will require us to + * examine the keys for a null. The "keys" rabas do not allow nulls, + * so we will need to use a "values" raba (nulls allowed) for the + * bucket keys. Unless we keep the entries in a buddy bucket dense + * (maybe making them dense when they are persisted for faster + * scans, but why bother for mutable buckets?) we will have to scan + * the entire buddy bucket to find an open slot (or just to count + * the #of slots which are currently in use). + * + * FIXME We need a modified MutableKeysRaba for this purpose. It + * will have to allow nulls in the middle (unless we compact + * everything). [We will have to compact prior to coding in order + * for the assumptions of the values coder to be maintained.] It + * would be pretty easy to just swap in the first undeleted tuple + * but that will not keep things dense. Likewise, Monet style + * cracking could only be done within a buddy bucket rather than + * across all buddies on a page. 
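 *
 * (Illustrative cost note: whatever raba implementation is used, the
 * open-slot scan below probes at most slotsPerBuddy = 1 << globalDepth
 * slots, e.g. 4 probes at globalDepth = 2, before either filling a null
 * slot or falling through to the split/grow logic.)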
+ */ + final MutableKeyBuffer keys = (MutableKeyBuffer)getKeys(); + final MutableValueBuffer vals = (MutableValueBuffer)getValues(); + + for (int i = buddyOffset; i < lastSlot; i++) { + if (keys.isNull(i)) { + keys.nkeys++; + keys.keys[i] = key; + vals.nvalues++; + vals.values[i] = val; + // TODO deleteMarker:=false + // TODO versionTimestamp:=... + ((HTree)htree).nentries++; + // insert Ok. + return true; + } + } + + /* + * Any buddy bucket which is full is split unless it is the sole + * buddy in the page since a split doubles the size of the buddy + * bucket (unless it is the only buddy on the page) and the tuple + * can therefore be inserted after a split. [This rule is not + * perfect if we allow splits to be driven by the bytes on a page, + * but it should still be Ok.] + * + * Before we can split the sole buddy bucket in a page, we need to + * know whether or not the keys are identical. If they are then we + * let the page grow rather than splitting it. This can be handled + * insert of bucketPage.insert(). It can have a boolean which is set + * false as soon as it sees a key which is not the equals() to the + * probe key (in all bits). + * + * Note that an allowed split always leaves enough room for another + * tuple (when considering only the #of tuples and not their bytes + * on the page). We can still be "out of space" in terms of bytes on + * the page, even for a single tuple. In this edge case, the tuple + * should really be a raw record. That is easily controlled by + * having a maximum inline value byte[] length for a page - probably + * on the order of pageSize/16 which works out to 256 bytes for a 4k + * page. + */ + if (nbuddies != 1) { + /* + * Force a split since there is more than one buddy on the page. + */ + return false; + } + + /* + * There is only one buddy on the page. Now we have to figure out + * whether or not all keys are duplicates. + */ + boolean identicalKeys = true; + for (int i = buddyOffset; i < buddyOffset + slotsPerBuddy; i++) { + if(!BytesUtil.bytesEqual(key,keys.get(i))) { + identicalKeys = false; + break; + } + } + if(!identicalKeys) { + /* + * Force a split since it is possible to redistribute some + * tuples. + */ + return false; + } + + /* + * Since the page is full, we need to grow the page (or chain an + * overflow page) rather than splitting the page. + * + * TODO Maybe the easiest thing to do is just double the target #of + * slots on the page. We would rely on keys.capacity() in this case + * rather than #slots. In fact, we could just reenter the method + * above after doubling as long as we rely on keys.capacity() in the + * case where nbuddies==1. + */ + throw new UnsupportedOperationException(); + } + + /** + * Human readable representation of the {@link ILeafData} plus transient + * information associated with the {@link BucketPage}. + */ + @Override + public String toString() { + + final StringBuilder sb = new StringBuilder(); + + sb.append(super.toString()); + + sb.append("{ isDirty="+isDirty()); + + sb.append(", isDeleted="+isDeleted()); + + sb.append(", addr=" + identity); + + final DirectoryPage p = (parent == null ? null : parent.get()); + + sb.append(", parent=" + (p == null ? "N/A" : p.toShortString())); + + sb.append(", globalDepth=" + getGlobalDepth()); + + if (data == null) { + + // No data record? (Generally, this means it was stolen by copy on + // write). 
+ sb.append(", data=NA}"); + + return sb.toString(); + + } + + sb.append(", nkeys=" + getKeyCount()); + +// sb.append(", minKeys=" + minKeys()); +// +// sb.append(", maxKeys=" + maxKeys()); + + DefaultLeafCoder.toString(this, sb); + + sb.append("}"); + + return sb.toString(); + + } + + protected boolean dump(final Level level, final PrintStream out, + final int height, final boolean recursive, + final boolean materialize) { + + final boolean debug = level.toInt() <= Level.DEBUG.toInt(); + + // Set to false iff an inconsistency is detected. + boolean ok = true; + + final int globalDepth = getGlobalDepth(); + + if (parent == null || parent.get() == null) { + out.println(indent(height) + "ERROR: parent not set"); + ok = false; + } + + if (globalDepth > parent.get().globalDepth) { + out.println(indent(height) + "ERROR: localDepth exceeds globalDepth of parent"); + ok = false; + } + + if (debug || ! ok ) { + + out.println(indent(height) + toString()); + + } + + return ok; + } + } // class BucketPage + + /** + * An {@link HTree} directory page (node). Each directory page will hold one + * or more "buddy" hash tables. The #of buddy hash tables on the page + * depends on the globalDepth of the page and the addressBits as computed by + * {@link DirectoryPage#getNumBuddies()}. + */ + static class DirectoryPage extends AbstractPage implements IDirectoryData { + + /** + * Transient references to the children. + */ + final Reference<AbstractPage>[] childRefs; + /** + * Persistent data. + */ + IDirectoryData data; + + /** * Get the child indexed by the key. * <p> * Note: The recursive descent pattern requires the caller to separately @@ -780,7 +1731,7 @@ * then we are done and we can return the child reference and the * offset of the buddy table or bucket within the child. */ - final Reference<AbstractPage> ref = refs[index]; + final Reference<AbstractPage> ref = childRefs[index]; AbstractPage child = ref == null ? null : ref.get(); @@ -847,7 +1798,7 @@ if (data.getChildAddr(i) == addr) { - refs[i] = (Reference) child.self; + childRefs[i] = (Reference) child.self; n++; @@ -862,112 +1813,7 @@ return child; } - -// /** -// * Return the address of the child page indexed by the relevant bits of -// * the key. -// * -// * @param hashBits -// * An int32 extract revealing only the relevant bits of the -// * key for the current {@link DirectoryPage}. -// * @param offset -// * Index position of the start of the buddy hash table in the -// * page. This is known to the caller when inspecting the -// * parent directory and is the index of the slot within the -// * buddy hash table. -// * -// * @return The address of the child. -// */ -// public long getChildAddrByHashCode(final int hashBits, final int offset) { -// // width of a buddy hash table in pointer slots. -// final int tableWidth = (1 << globalDepth); -// // index position of the start of the buddy hash table in the page. -// final int tableOffset = (tableWidth * offset); -// // index of the slot in the buddy hash table for the given hash -// // bits. -// final int index = tableOffset + hashBits; -// // the address of the child for that slot. -// final long addr = data.getChildAddr(index); -// return addr; -// } - -// /** -// * The #of bits in the key examined at a given level in the tree. The -// * range is [0:addressBits]. -// * <p> -// * This depends solely on the <i>globalDepth</i> of a directory page and -// * the #of pointers to child (<i>npointers</i>) in that directory page. 
-// * The local depth is obtained counting the #of slots in the appropriate -// * buddy hash table of the parent which are mapped onto the same child. -// * Given 2^(d-i) such entries in the parent, the local depth of that -// * child is i. Some fancy bit work may be used to solve for i. -// * <p> -// * The local depth of a node may not exceed the global depth of its -// * parent. If splitting a buddy bucket would cause the local depth to -// * exceed the global depth of the parent, then the global depth of the -// * parent is increased, which causes the parent to be split in turn. -// * Since the root directory does not have a parent its local depth is -// * not defined. -// * -// * @param child -// * A reference to a direct child of this -// * {@link DirectoryPage}. -// * -// * @throws IllegalArgumentException -// * if the <i>child</i> is <code>null</code> -// * @throws IllegalArgumentException -// * if the <i>child</i> is not a direct child of this -// * {@link DirectoryPage}. -// */ -// int getLocalDepth(final AbstractPage child) { -// -// final int npointers = co... [truncated message content] |