From: <tho...@us...> - 2011-02-18 19:39:08
Revision: 4208 http://bigdata.svn.sourceforge.net/bigdata/?rev=4208&view=rev Author: thompsonbry Date: 2011-02-18 19:39:00 +0000 (Fri, 18 Feb 2011) Log Message: ----------- Added support for random sampling of the standalone database B+Tree indices in support of the Runtime Query Optimizer. Modified the Advancer pattern to permit one-time initialization of the advancer. Added information about the selected join path to the JoinGraph INFO trace. Modified Paths: -------------- branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/ap/SampleIndex.java branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/controller/JoinGraph.java branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/join/PipelineJoin.java branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/btree/filter/Advancer.java branches/QUADS_QUERY_BRANCH/bigdata/src/test/com/bigdata/bop/ap/TestSampleIndex.java branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/bop/rdf/joinGraph/TestJoinGraphOnBSBMData.java Modified: branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/ap/SampleIndex.java =================================================================== --- branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/ap/SampleIndex.java 2011-02-17 22:58:07 UTC (rev 4207) +++ branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/ap/SampleIndex.java 2011-02-18 19:39:00 UTC (rev 4208) @@ -27,10 +27,16 @@ package com.bigdata.bop.ap; +import it.unimi.dsi.bits.BitVector; +import it.unimi.dsi.bits.LongArrayBitVector; +import it.unimi.dsi.fastutil.ints.IntOpenHashSet; + import java.io.Serializable; import java.util.ArrayList; +import java.util.Arrays; import java.util.Iterator; import java.util.Map; +import java.util.Random; import java.util.concurrent.Callable; import com.bigdata.bop.AbstractAccessPathOp; @@ -45,6 +51,7 @@ import com.bigdata.btree.ITupleCursor; import com.bigdata.btree.filter.Advancer; import com.bigdata.btree.view.FusedView; +import com.bigdata.rawstore.Bytes; import com.bigdata.relation.IRelation; import com.bigdata.relation.accesspath.AccessPath; import com.bigdata.relation.accesspath.IAccessPath; @@ -79,6 +86,27 @@ private static final long serialVersionUID = 1L; /** + * Typesafe enumeration of different kinds of index sampling strategies. + * + * @todo It is much more efficient to take clusters of samples when you can + * accept the bias. Taking a clustered sample really requires knowing + * where the leaf boundaries are in the index, e.g., using + * {@link ILeafCursor}. Taking all tuples from a few leaves in each + * sample might produce a faster estimation of the correlation when + * sampling join paths. + */ + public static enum SampleType { + /** + * Samples are taken at evenly spaced offsets. + */ + EVEN, + /** + * Sample offsets are computed randomly. + */ + RANDOM; + } + + /** * Known annotations. */ public interface Annotations extends BOp.Annotations { @@ -86,16 +114,32 @@ /** * The sample limit (default {@value #DEFAULT_LIMIT}). */ - String LIMIT = "limit"; + String LIMIT = SampleIndex.class.getName() + ".limit"; int DEFAULT_LIMIT = 100; /** + * The random number generator seed -or- ZERO (0L) for a random seed + * (default {@value #DEFAULT_SEED}). A non-zero value may be used to + * create a repeatable sample. + */ + String SEED = SampleIndex.class.getName() + ".seed"; + + long DEFAULT_SEED = 0L; + + /** * The {@link IPredicate} describing the access path to be sampled * (required). 
*/ String PREDICATE = SampleIndex.class.getName() + ".predicate"; + + /** + * The type of sample to take (default {@value #DEFAULT_SAMPLE_TYPE}). + */ + String SAMPLE_TYPE = SampleIndex.class.getName() + ".sampleType"; + String DEFAULT_SAMPLE_TYPE = SampleType.RANDOM.name(); + } public SampleIndex(SampleIndex<E> op) { @@ -115,7 +159,20 @@ return getProperty(Annotations.LIMIT, Annotations.DEFAULT_LIMIT); } + + public long seed() { + return getProperty(Annotations.SEED, Annotations.DEFAULT_SEED); + + } + + public SampleType getSampleType() { + + return SampleType.valueOf(getProperty(Annotations.SAMPLE_TYPE, + Annotations.DEFAULT_SAMPLE_TYPE)); + + } + @SuppressWarnings("unchecked") public IPredicate<E> getPredicate() { @@ -195,7 +252,7 @@ /** Return a sample from the access path. */ public E[] call() throws Exception { - return sample(limit(), getPredicate()).getSample(); + return sample(limit(), getSampleType(), getPredicate()).getSample(); } @@ -206,7 +263,7 @@ * @return */ public AccessPathSample<E> sample(final int limit, - IPredicate<E> predicate) { + final SampleType sampleType, IPredicate<E> predicate) { final IRelation<E> relation = context.getRelation(predicate); @@ -242,10 +299,25 @@ /* * Add advancer to collect sample. */ + + final Advancer<E> advancer; + switch (sampleType) { + case EVEN: + advancer = new EvenSampleAdvancer<E>(// rangeCount, + limit, accessPath.getFromKey(), accessPath.getToKey()); + break; + case RANDOM: + advancer = new RandomSampleAdvancer<E>(// rangeCount, + seed(), limit, accessPath.getFromKey(), accessPath + .getToKey()); + break; + default: + throw new UnsupportedOperationException("SampleType=" + + sampleType); + } + predicate = ((Predicate<E>) predicate) - .addIndexLocalFilter(new SampleAdvancer<E>(//rangeCount, - limit, accessPath.getFromKey(), accessPath - .getToKey())); + .addIndexLocalFilter(advancer); return new AccessPathSample<E>(limit, context.getAccessPath( relation, predicate)); @@ -256,20 +328,21 @@ /** * An advancer pattern which is designed to take evenly distributed samples - * from an index. The caller specifies the #of tuples to be skipped after - * each tuple visited. That number should be computed based on the estimated - * range count of the index and the desired sample size. This can fail to - * gather the desired number of sample if additional filters are applied - * which further restrict the elements selected by the predicate. However, - * it will still faithfully represent the expected - * cardinality of the - * sampled access path. + * from an index. The caller specifies the #of tuples to be sampled. This + * class estimates the range count of the access path and then computes the + * #of samples to be skipped after each tuple visited. + * <p> + * Note: This can fail to gather the desired number of samples if additional + * filters are applied which further restrict the elements selected by the + * predicate. However, it will still faithfully represent the expected + * cardinality of the sampled access path (tuples tested). * * @author tho...@us... * * @param <E> * The generic type of the elements visited by that access path. */ - private static class SampleAdvancer<E> extends Advancer<E> { + private static class EvenSampleAdvancer<E> extends Advancer<E> { private static final long serialVersionUID = 1L; @@ -296,30 +369,13 @@ * @param limit * The #of samples to visit. 
*/ - public SampleAdvancer(final int limit, final byte[] fromKey, + public EvenSampleAdvancer(final int limit, final byte[] fromKey, final byte[] toKey) { this.limit = limit; this.toKey = toKey; } - /** - * @todo This is taking evenly spaced samples. It is much more efficient - * to take clusters of samples when you can accept the bias. - * Taking a clustered sample really requires knowing where the - * leaf boundaries are in the index, e.g., using - * {@link ILeafCursor}. - * <p> - * Taking all tuples from a few leaves in each sample might - * produce a faster estimation of the correlation when sampling - * join paths. - * - * @todo Rather than evenly spaced samples, we should be taking a random - * sample. This could be achieved using a random initial offset - * and random increment as long as the initial offset was in the - * range of a single increment and we compute the increment such - * that N+1 intervals exist. - */ @Override protected void advance(final ITuple<E> tuple) { @@ -336,6 +392,11 @@ toIndex = toKey == null ? ndx.getEntryCount() : ndx .indexOf(toKey); + if (toIndex < 0) { + // convert insert position to index. + toIndex = -toIndex + 1; + } + final int rangeCount = (toIndex - fromIndex); skipCount = Math.max(1, rangeCount / limit); @@ -365,9 +426,125 @@ } - } // class SampleAdvancer + } // class EvenSampleAdvancer /** + * An advancer pattern which is designed to take randomly distributed + * samples from an index. The caller specifies the #of tuples to be sampled. + * This class estimates the range count of the access path and then computes + * a set of random offsets into the access path from which it will collect + * the desired #of samples. + * <p> + * Note: This can fail to gather the desired number of samples if additional + * filters are applied which further restrict the elements selected by the + * predicate. However, it will still faithfully represent the expected + * cardinality of the sampled access path (tuples tested). + * + * @author tho...@us... + * + * @param <E> + * The generic type of the elements visited by that access path. + */ + private static class RandomSampleAdvancer<E> extends Advancer<E> { + + private static final long serialVersionUID = 1L; + + /** The random number generator seed. */ + private final long seed; + + /** The desired total limit on the sample. */ + private final int limit; + + private final byte[] fromKey, toKey; + + /* + * Transient data. This gets initialized when we visit the first tuple. + */ + + /** The offset of each tuple to be sampled. */ + private transient int[] offsets; + /** The #of tuples accepted so far. */ + private transient int nread = 0; + /** The inclusive lower bound of the first tuple actually visited. */ + private transient int fromIndex; + /** The exclusive upper bound of the last tuple which could be visited. */ + private transient int toIndex; + + /** + * + * @param limit + * The #of samples to visit. + */ + public RandomSampleAdvancer(final long seed, final int limit, + final byte[] fromKey, final byte[] toKey) { + + this.seed = seed; + this.limit = limit; + this.fromKey = fromKey; + this.toKey = toKey; + } + + @Override + protected boolean init() { + + final AbstractBTree ndx = (AbstractBTree) src.getIndex(); + + // inclusive lower bound. + fromIndex = fromKey == null ? 0 : ndx.indexOf(fromKey); + + if (fromIndex < 0) { + // convert insert position to index. + fromIndex = -fromIndex + 1; + } + + // exclusive upper bound. + toIndex = toKey == null ? 
ndx.getEntryCount() : ndx.indexOf(toKey); + + if (toIndex < 0) { + // convert insert position to index. + toIndex = -toIndex + 1; + } + + // get offsets to be sampled. + offsets = new SmartOffsetSampler().getOffsets(seed, limit, + fromIndex, toIndex); + + // Skip to the first tuple. + src.seek(ndx.keyAt(offsets[0])); + + return true; + + } + + @Override + protected void advance(final ITuple<E> tuple) { + + final AbstractBTree ndx = (AbstractBTree) src.getIndex(); + + if (nread < offsets.length - 1) { + + /* + * Skip to the next tuple. + */ + + final int nextIndex = offsets[nread]; + +// System.err.println("limit=" + limit + ", rangeCount=" +// + (toIndex - fromIndex) + ", fromIndex=" + fromIndex +// + ", toIndex=" + toIndex + ", currentIndex=" +// + currentIndex + ", nextIndex=" + nextIndex); + + src.seek(ndx.keyAt(nextIndex)); + + } + + nread++; + + } + + } // class RandomSampleAdvancer + + /** * A sample from an access path. * * @param <E> @@ -459,4 +636,355 @@ } // AccessPathSample + /** + * Interface for obtaining an array of tuple offsets to be sampled. + * + * @author thompsonbry + */ + public interface IOffsetSampler { + + /** + * Return an array of tuple indices which may be used to sample a key + * range of some index. + * <p> + * Note: The caller must stop when it runs out of offsets, not when the + * limit is satisfied, as there will be fewer offsets returned when the + * half open range is smaller than the limit. + * + * @param seed + * The seed for the random number generator -or- ZERO (0L) + * for a random seed. A non-zero value may be used to create + * a repeatable sample. + * @param limit + * The maximum #of tuples to sample. + * @param fromIndex + * The inclusive lower bound. + * @param toIndex + * The exclusive upper bound. + * + * @return An array of at most <i>limit</i> offsets into the index. The + * offsets will lie in the half open range [fromIndex, toIndex). + * The elements of the array will be in ascending order. No + * offsets will be repeated. + * + * @throws IllegalArgumentException + * if <i>limit</i> is non-positive. + * @throws IllegalArgumentException + * if <i>fromIndex</i> is negative. + * @throws IllegalArgumentException + * if <i>toIndex</i> is negative. + * @throws IllegalArgumentException + * unless <i>toIndex</i> is GT <i>fromIndex</i>. + */ + int[] getOffsets(final long seed, int limit, final int fromIndex, + final int toIndex); + } + + /** + * A smart implementation which uses whichever implementation is most + * efficient for the limit and key range to be sampled. + * + * @author thompsonbry + */ + public static class SmartOffsetSampler implements IOffsetSampler { + + /** + * {@inheritDoc} + */ + public int[] getOffsets(final long seed, int limit, + final int fromIndex, final int toIndex) { + + if (limit < 1) + throw new IllegalArgumentException(); + if (fromIndex < 0) + throw new IllegalArgumentException(); + if (toIndex < 0) + throw new IllegalArgumentException(); + if (toIndex <= fromIndex) + throw new IllegalArgumentException(); + + final int rangeCount = (toIndex - fromIndex); + + if (limit > rangeCount) + limit = rangeCount; + + if (limit == rangeCount) { + + // Visit everything. + return new EntireRangeOffsetSampler().getOffsets(seed, limit, + fromIndex, toIndex); + + } + + /* + * Random offsets visiting a subset of the key range using a + * selection without replacement pattern (the same tuple is never + * visited twice). 
+ * + * FIXME When the limit approaches the range count and the range + * count is large (too large for a bit vector or acceptance set + * approach), then we are better off creating a hash set of offsets + * NOT to be visited and then just choosing (rangeCount-limit) + * offsets to reject. This will be less expensive than computing the + * acceptance set directly. However, to really benefit from the + * smaller memory profile, we would also need to wrap that with an + * iterator pattern so the smaller memory representation could be of + * use when the offset[] is applied (e.g., modify the IOffsetSampler + * interface to be an iterator with various ctor parameters rather + * than returning an array as we do today). + */ + + // FIXME BitVectorOffsetSampler is broken. + if (false && rangeCount < Bytes.kilobyte32 * 8) { + + // NB: 32k range count uses a 4k bit vector. + return new BitVectorOffsetSampler().getOffsets(seed, limit, + fromIndex, toIndex); + + } + + /* + * When limit is small (or significantly smaller than the + * rangeCount), then we are much better off creating a hash set of + * the offsets which have been accepted. + * + * Good unless [limit] is very large. + */ + return new AcceptanceSetOffsetSampler().getOffsets(seed, limit, + fromIndex, toIndex); + + } + + } + + /** + * Returns all offsets in the half-open range, but may only be used when + * the limit GTE the range count. + */ + static public class EntireRangeOffsetSampler implements IOffsetSampler { + + /** + * {@inheritDoc} + * + * @throws UnsupportedOperationException + * if <i>limit!=rangeCount</i> (after adjusting for limits + * greater than the rangeCount). + */ + public int[] getOffsets(final long seed, int limit, + final int fromIndex, final int toIndex) { + + if (limit < 1) + throw new IllegalArgumentException(); + if (fromIndex < 0) + throw new IllegalArgumentException(); + if (toIndex < 0) + throw new IllegalArgumentException(); + if (toIndex <= fromIndex) + throw new IllegalArgumentException(); + + final int rangeCount = (toIndex - fromIndex); + + if (limit > rangeCount) + limit = rangeCount; + + if (limit != rangeCount) + throw new UnsupportedOperationException(); + + // offsets of tuples to visit. + final int[] offsets = new int[limit]; + + for (int i = 0; i < limit; i++) { + + offsets[i] = fromIndex + i; + + } + + return offsets; + + } + } + + /** + * Return a randomly selected ordered array of offsets in the given + * half-open range. + * <p> + * This approach is based on a bit vector. If the bit is already marked, + * then the offset has been used and we scan until we find the next free + * offset. This requires [rangeCount] bits, so it works well when the + * rangeCount of the key range is small. For example, a range count of 32k + * requires a 4kb bit vector, which is quite manageable. + * + * FIXME There is something broken in this class, probably an assumption I + * have about how {@link LongArrayBitVector} works. If you enable it in the + * stress test, it will fail. 
+ */ + static public class BitVectorOffsetSampler implements IOffsetSampler { + + public int[] getOffsets(final long seed, int limit, + final int fromIndex, final int toIndex) { + + if (limit < 1) + throw new IllegalArgumentException(); + if (fromIndex < 0) + throw new IllegalArgumentException(); + if (toIndex < 0) + throw new IllegalArgumentException(); + if (toIndex <= fromIndex) + throw new IllegalArgumentException(); + + final int rangeCount = (toIndex - fromIndex); + + if (limit > rangeCount) + limit = rangeCount; + + // offsets of tuples to visit. + final int[] offsets = new int[limit]; + + // create a cleared bit vector of the stated capacity. + final BitVector v = LongArrayBitVector.ofLength(// + rangeCount// capacity (in bits) + ); + + // Random number generator using caller's seed (if given). + final Random rnd = seed == 0L ? new Random() : new Random(seed); + + // Choose random tuple indices for the remaining tuples. + for (int i = 0; i < limit; i++) { + + /* + * Look for an unused bit starting at this index. If necessary, + * this will wrap around to zero. + */ + + // k in (0:rangeCount-1). + int k = rnd.nextInt(rangeCount); + + if (v.getBoolean((long) k)) { + // This bit is already taken. + final long nextZero = v.nextZero((long) k); + if (nextZero != -1L) { + k = (int) nextZero; + } else { + final long priorZero = v.previousZero((long) k); + if (priorZero != -1L) { + k = (int) priorZero; + } else { + // No empty bit found? + throw new AssertionError(); + } + } + } + + assert !v.getBoolean(k); + + // Set the bit. + v.add(k, true); + + assert v.getBoolean(k); + + offsets[i] = fromIndex + k; + + assert offsets[i] < toIndex; + + } + + // put them into sorted order for more efficient traversal. + Arrays.sort(offsets); + + // System.err.println(Arrays.toString(offsets)); + + return offsets; + + } + + } + + /** + * An implementation based on an acceptance set recording the offsets which + * have already been accepted. This implementation is a good choice when the + * limit is moderate (~100k) and the rangeCount is significantly greater + * than the limit. The memory demand is O(limit). + * + * @author thompsonbry + */ + static public class AcceptanceSetOffsetSampler implements IOffsetSampler { + + public int[] getOffsets(final long seed, int limit, + final int fromIndex, final int toIndex) { + + if (limit < 1) + throw new IllegalArgumentException(); + if (fromIndex < 0) + throw new IllegalArgumentException(); + if (toIndex < 0) + throw new IllegalArgumentException(); + if (toIndex <= fromIndex) + throw new IllegalArgumentException(); + + final int rangeCount = (toIndex - fromIndex); + + if (limit > rangeCount) + limit = rangeCount; + + // offsets of tuples to visit. + final int[] offsets = new int[limit]; + + // hash set of accepted offsets. + final IntOpenHashSet v = new IntOpenHashSet( + rangeCount// capacity + ); + + // Random number generator using caller's seed (if given). + final Random rnd = seed == 0L ? new Random() : new Random(seed); + + // Choose random tuple indices for the remaining tuples. + for (int i = 0; i < limit; i++) { + + /* + * Look for an unused offset starting at this index. If + * necessary, this will wrap around to zero. + */ + + // k in (0:rangeCount-1). + int k = rnd.nextInt(rangeCount); + + int round = 0; + while (v.contains(k)) { + + k++; + + if (k == rangeCount) { + // wrap around. + if (++round > 1) { + // no unused offset found? + throw new AssertionError(); + } + // reset starting index. + k = 0; + } + + } + + assert !v.contains(k); + + // Record the accepted offset. 
+ v.add(k); + + offsets[i] = fromIndex + k; + + assert offsets[i] < toIndex; + + } + + // put them into sorted order for more efficient traversal. + Arrays.sort(offsets); + + // System.err.println(Arrays.toString(offsets)); + + return offsets; + + } + + } + } Modified: branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/controller/JoinGraph.java =================================================================== --- branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/controller/JoinGraph.java 2011-02-17 22:58:07 UTC (rev 4207) +++ branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/controller/JoinGraph.java 2011-02-18 19:39:00 UTC (rev 4208) @@ -60,6 +60,7 @@ import com.bigdata.bop.NV; import com.bigdata.bop.PipelineOp; import com.bigdata.bop.ap.SampleIndex; +import com.bigdata.bop.ap.SampleIndex.SampleType; import com.bigdata.bop.bindingSet.HashBindingSet; import com.bigdata.bop.engine.IRunningQuery; import com.bigdata.bop.engine.LocalChunkMessage; @@ -534,11 +535,16 @@ * Materialize a random sample from the access path. */ - final SampleIndex sampleOp = new SampleIndex( +// final SampleType sampleType = SampleType.EVEN; + final SampleType sampleType = SampleType.RANDOM; + + final SampleIndex<?> sampleOp = new SampleIndex( new BOp[] {}, // NV.asMap(// new NV(SampleIndex.Annotations.PREDICATE, pred),// - new NV(SampleIndex.Annotations.LIMIT, limit))); + new NV(SampleIndex.Annotations.LIMIT, limit),// + new NV(SampleIndex.Annotations.SAMPLE_TYPE, sampleType.name())// + )); sample = new VertexSample(rangeCount, limit, false/* exact */, sampleOp.eval(context)); @@ -1081,7 +1087,7 @@ // @todo Why not use a factory which avoids bopIds already in use? new NV(PipelineJoin.Annotations.PREDICATE, vTarget.pred.setBOpId(3)), // disallow parallel evaluation of tasks. - new NV(PipelineJoin.Annotations.MAX_PARALLEL,1), + new NV(PipelineOp.Annotations.MAX_PARALLEL,1), // disallow parallel evaluation of chunks. new NV(PipelineJoin.Annotations.MAX_PARALLEL_CHUNKS,0), // disable access path coalescing @@ -1172,6 +1178,11 @@ * cardinality at 1600L (lower bound). In fact, the cardinality * is 16*175000. This falsely low estimate can cause solutions * which are really better to be dropped. + * + * @todo we should mark [nout] when we do this so that it + * shows up in the trace! Also, the rangeCount is sometimes + * falsely high. However, that should be corrected by random + * resampling of the vertices and paths. */ nout = sumRangeCount; @@ -1226,11 +1237,14 @@ /** * The cumulative estimated cardinality of the path. This is zero for an * empty path. For a path consisting of a single edge, this is the - * estimated cardinality of that edge. When creating a new path adding - * an edge to an existing path, the cumulative cardinality of the new - * path is the cumulative cardinality of the existing path plus the + * estimated cardinality of that edge. When creating a new path by + * adding an edge to an existing path, the cumulative cardinality of the + * new path is the cumulative cardinality of the existing path plus the * estimated cardinality of the cutoff join of the new edge given the * input sample of the existing path. + * + * @todo track this per vertex as well as the total for more interesting + * traces in showPath(Path). 
*/ final public long cumulativeEstimatedCardinality; @@ -1672,7 +1686,7 @@ static public String showTable(final Path[] a,final Path[] pruned) { final StringBuilder sb = new StringBuilder(); final Formatter f = new Formatter(sb); - f.format("%5s %10s%1s * %7s (%3s/%3s) = %10s%1s : %10s %10s", + f.format("%5s %10s%1s * %10s (%6s/%6s) = %10s%1s : %10s %10s", "path",// "rangeCount",// "",// sourceSampleExact @@ -1698,9 +1712,9 @@ } } if (x.sample == null) { - f.format("p[%2d] %10d%1s * %7s (%3s/%3s) = %10s%1s : %10s", i, "N/A", "", "N/A", "N/A", "N/A", "N/A", "", "N/A"); + f.format("p[%2d] %10s%1s * %10s (%6s/%6s) = %10s%1s : %10s", i, "N/A", "", "N/A", "N/A", "N/A", "N/A", "", "N/A"); } else { - f.format("p[%2d] %10d%1s * % 7.2f (%3d/%3d) = % 10d%1s : % 10d", i, + f.format("p[%2d] %10d%1s * % 10.2f (%6d/%6d) = % 10d%1s : % 10d", i, x.sample.rangeCount,// x.sample.sourceSampleExact?"E":"",// x.sample.f,// @@ -1730,6 +1744,66 @@ } /** + * Show the details of a join path, including the estimated cardinality and + * join hit ratio for each step in the path. + * + * @param x + * The join path. + */ + public static String showPath(final Path x) { + if(x == null) + throw new IllegalArgumentException(); + final StringBuilder sb = new StringBuilder(); + final Formatter f = new Formatter(sb); + { + /* + * @todo show sumEstCard for each step of the path. Only the + * estimate for the current path length is currently preserved. We + * would need to preserve the estimate for each step in the path to + * show it here. + * + * @todo show limit on EdgeSample? + */ + f.format("%6s %10s%1s * %10s (%6s/%6s) = %10s%1s",// : %10s",// + "edge", + "rangeCount",// + "",// sourceSampleExact + "f",// + "out",// + "in",// + "estCard",// + ""// estimateIs(Exact|LowerBound|UpperBound) +// "sumEstCard",// + ); + int i = 0; + for (Edge e : x.edges) { + sb.append("\n"); + if (e.sample == null) { + f.format("%6s %10s%1s * %10s (%6s/%6s) = %10s%1s",// + e.getLabel(),// + "N/A", "", "N/A", "N/A", "N/A", "N/A", "", "N/A"); + } else { + f.format("%6s %10d%1s * % 10.2f (%6d/%6d) = % 10d%1s",// + e.getLabel(),// + e.sample.rangeCount,// + e.sample.sourceSampleExact ? "E" : "",// + e.sample.f,// + e.sample.outputCount,// + e.sample.inputCount,// + e.sample.estimatedCardinality,// + e.sample.estimateEnum.getCode()// +// e.cumulativeEstimatedCardinality// + ); + } +// sb.append("\nv[" + vertexIds[i] + "] " + e.toString()); + i++; + } + } + sb.append("\n"); + return sb.toString(); + } + + /** * A runtime optimizer for a join graph. The {@link JoinGraph} bears some * similarity to ROX (Runtime Optimizer for XQuery), but has several * significant differences: @@ -2148,6 +2222,18 @@ // Should be one winner. assert paths.length == 1; + if (log.isInfoEnabled()) { + + /* + * @todo It would be nice to show the plan with the filters + * attached, but that might be something that the caller does. 
+ */ + log.info("\n*** Selected join path: " + + Arrays.toString(paths[0].getVertexIds()) + "\n" + + showPath(paths[0])); + + } + return paths[0]; } Modified: branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/join/PipelineJoin.java =================================================================== --- branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/join/PipelineJoin.java 2011-02-17 22:58:07 UTC (rev 4207) +++ branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/join/PipelineJoin.java 2011-02-18 19:39:00 UTC (rev 4208) @@ -204,9 +204,7 @@ * The maximum #of solutions which will be generated by the join * (default {@value #DEFAULT_LIMIT}). * - * @todo Unit tests for this feature. It currently breaks out of loops - * but does not explicitly interrupt the task. See the uses of - * {@link JoinTask#limit}. + * @todo Unit tests for this feature (it is used by the JoinGraph). */ String LIMIT = PipelineJoin.class.getName() + ".limit"; @@ -594,6 +592,8 @@ /** * An optional limit on the #of solutions to be produced. The limit is * ignored if it is {@link Long#MAX_VALUE}. + * + * @see Annotations#LIMIT */ final private long limit; @@ -808,6 +808,9 @@ // // stats.elapsed.add(System.currentTimeMillis() - begin); +// } finally { +// System.err.println(joinOp.toString()); +// System.err.println(stats.toString()); } } @@ -1624,6 +1627,12 @@ halted(); if (limit != Long.MAX_VALUE && exactOutputCount.get() > limit) { + // break query @ limit. + if (log.isInfoEnabled()) + log.info("Breaking query @ limit: limit=" + limit + + ", exactOutputCount=" + + exactOutputCount.get()); +// halt((Void) null); return null; } @@ -1713,6 +1722,12 @@ if (limit != Long.MAX_VALUE && exactOutputCount.incrementAndGet() > limit) { + // break query @ limit. + if (log.isInfoEnabled()) + log.info("Breaking query @ limit: limit=" + limit + + ", exactOutputCount=" + + exactOutputCount.get()); +// halt((Void) null); break; } @@ -1927,6 +1942,12 @@ if (limit != Long.MAX_VALUE && exactOutputCount.incrementAndGet() > limit) { + // break query @ limit. + if (log.isInfoEnabled()) + log.info("Breaking query @ limit: limit=" + limit + + ", exactOutputCount=" + + exactOutputCount.get()); +// halt((Void) null); break; } @@ -2119,6 +2140,12 @@ if (limit != Long.MAX_VALUE && exactOutputCount.incrementAndGet() > limit) { + // break query @ limit. + if (log.isInfoEnabled()) + log.info("Breaking query @ limit: limit=" + limit + + ", exactOutputCount=" + + exactOutputCount.get()); +// halt((Void) null); break; } Modified: branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/btree/filter/Advancer.java =================================================================== --- branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/btree/filter/Advancer.java 2011-02-17 22:58:07 UTC (rev 4207) +++ branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/btree/filter/Advancer.java 2011-02-18 19:39:00 UTC (rev 4208) @@ -63,6 +63,18 @@ } + /** + * Hook for one-time initialization invoked before the advancer visits the + * first tuple. The default implementation simply returns <code>true</code>. + * + * @return <code>false</code> if nothing should be visited. + */ + protected boolean init() { + + return true; + + } + /** * Offers an opportunity to advance the source {@link ITupleCursor} to a * new key using {@link ITupleCursor#seek(byte[]). @@ -87,6 +99,11 @@ final private Advancer<E> filter; /** + * Used to invoke {@link Advancer#init()}. 
+ */ + private boolean firstTime = true; + + /** * Set true iff we exceed the bounds on the {@link ITupleCursor}. For * example, if we run off the end of an index partition. This is used to * simulate the exhaustion of the cursor when you advance past its @@ -116,6 +133,20 @@ public boolean hasNext() { + if(firstTime) { + + if (!filter.init()) { + + exhausted = true; + + return false; + + } + + firstTime =false; + + } + if(exhausted) return false; return src.hasNext(); Modified: branches/QUADS_QUERY_BRANCH/bigdata/src/test/com/bigdata/bop/ap/TestSampleIndex.java =================================================================== --- branches/QUADS_QUERY_BRANCH/bigdata/src/test/com/bigdata/bop/ap/TestSampleIndex.java 2011-02-17 22:58:07 UTC (rev 4207) +++ branches/QUADS_QUERY_BRANCH/bigdata/src/test/com/bigdata/bop/ap/TestSampleIndex.java 2011-02-18 19:39:00 UTC (rev 4208) @@ -41,6 +41,10 @@ import com.bigdata.bop.NV; import com.bigdata.bop.Var; import com.bigdata.bop.IPredicate.Annotations; +import com.bigdata.bop.ap.SampleIndex.AcceptanceSetOffsetSampler; +import com.bigdata.bop.ap.SampleIndex.IOffsetSampler; +import com.bigdata.bop.ap.SampleIndex.SampleType; +import com.bigdata.bop.ap.SampleIndex.SmartOffsetSampler; import com.bigdata.journal.BufferMode; import com.bigdata.journal.ITx; import com.bigdata.journal.Journal; @@ -175,8 +179,124 @@ } /** + * Stress test for {@link IOffsetSampler}s. + * + * TODO Look at the distributions of the different {@link IOffsetSampler}s. + * They should be uniform. + */ + public void test_offsetSamplers() { + + // Note: Only handles a special case! +// new GetOffsetsEntireRange(), + + final IOffsetSampler[] samplers = new IOffsetSampler[] { + new SmartOffsetSampler(), // +// new BitVectorOffsetSampler(),// + new AcceptanceSetOffsetSampler(),// +// new RejectionSetOffsetSampler(), // + }; + + final Random r = new Random(); + + final int ntrials = 1000; + + for (int trial = 0; trial < ntrials; trial++) { + + // 10% seed is 0L (which gets turned into random anyway) + final long seed = r.nextDouble() < .1 ? 0 : r.nextLong(); + + final int entryCount = r.nextInt(100000); + + // 10% fromIndex is zero. + final int fromIndex = r.nextDouble() < .1 ? 0 : r + .nextInt(entryCount); + + final int remaining = entryCount - fromIndex; + + final int toIndex = r.nextDouble() < .1 ? entryCount : (fromIndex + + r.nextInt(remaining) + 1); + + final int rangeCount = toIndex - fromIndex; + + final int limit = r.nextDouble() < .1 ? r.nextInt(100) + 1 : r + .nextDouble() < .5 ? r.nextInt(entryCount) + 1 : r + .nextInt(10000) + 1; + + for (IOffsetSampler sampler : samplers) { + + try { + + final long begin = System.currentTimeMillis(); + + final int[] offsets = sampler.getOffsets(seed, limit, fromIndex, toIndex); + + final long elapsed = System.currentTimeMillis() - begin; + + if (elapsed > 1000) { + log.warn("Slow: elapsed=" + elapsed + ", class=" + + sampler.getClass() + ", seed=" + seed + + ", limit=" + limit + ", fromIndex=" + + fromIndex + ",toIndex=" + toIndex); + } + + // check the #of offsets returned. + final int noffsets = offsets.length; + assertTrue(noffsets <= limit); + if (limit > rangeCount) + assertTrue(noffsets <= rangeCount); + else + assertTrue(noffsets == limit); + + // check offsets ordered, within range, and w/o dups. 
+ int lastOffset = -1; + for (int j = 0; j < offsets.length; j++) { + + final int offset = offsets[j]; + + if (offset < fromIndex) + fail("index=" + j + + ", offset LT fromIndex: offset=" + offset + + ", fromIndex=" + fromIndex); + + if (offset >= toIndex) + fail("index=" + j + ", offset GTE toIndex: offset=" + + offset + ", toIndex=" + toIndex); + + if (offset <= lastOffset) { + fail("index=" + j + ", lastOffset=" + lastOffset + + ", but offset=" + offset); + } + + lastOffset = offset; + + } + + } catch (Throwable t) { + + fail("sampler=" + sampler.getClass() + ", seed=" + seed + + ", limit=" + limit + ", fromIndex=" + fromIndex + + ",toIndex=" + toIndex + ", rangeCount=" + + rangeCount, t); + + } + + } + + } + + } + + /** * Unit test verifies some aspects of a sample taken from a local index * (primarily that the sample respects the limit). + * + * @todo test when the range count is zero. + * + * @todo test when the inclusive lower bound of a key range is an insertion + * point (no tuple for that key). + * + * @todo test when the exclusive upper bound of a key range is an insertion + * point (no tuple for that key). */ public void test_something() { @@ -194,42 +314,59 @@ new NV(Annotations.TIMESTAMP, ITx.READ_COMMITTED)// ); - final BOpContextBase context = new BOpContextBase(null/* fed */, jnl/* indexManager */); - final int[] limits = new int[] { // 1, 9, 19, 100, 217, 900,// nrecords, nrecords + 1 }; - for (int limit : limits) { + for (SampleType sampleType : SampleType.values()) { - final SampleIndex<E> sampleOp = new SampleIndex<E>( - new BOp[0], - NV - .asMap( - // - new NV(SampleIndex.Annotations.PREDICATE, - predicate),// - new NV(SampleIndex.Annotations.LIMIT, limit)// - )); + if (log.isInfoEnabled()) + log.info("Testing: SampleType=" + sampleType); - final E[] a = sampleOp.eval(context); + for (int limit : limits) { -// System.err.println("limit=" + limit + ", nrecords=" + nrecords -// + ", nsamples=" + a.length); -// -// for (int i = 0; i < a.length && i < 10; i++) { -// System.err.println("a[" + i + "]=" + a[i]); -// } + doTest(nrecords, limit, sampleType, predicate); - final int nexpected = Math.min(nrecords, limit); + } - assertEquals("#samples (limit=" + limit + ", nrecords=" + nrecords - + ", nexpected=" + nexpected + ")", nexpected, a.length); - } } + + private void doTest(final int nrecords, final int limit, + final SampleType sampleType, final IPredicate<E> predicate) { + final BOpContextBase context = new BOpContextBase(null/* fed */, jnl/* indexManager */); + + final SampleIndex<E> sampleOp = new SampleIndex<E>( new BOp[0], // + NV.asMap(// + new NV(SampleIndex.Annotations.PREDICATE, predicate),// + new NV(SampleIndex.Annotations.LIMIT, limit),// + new NV(SampleIndex.Annotations.SAMPLE_TYPE, sampleType + .name())// + )); + + final E[] a = sampleOp.eval(context); + + if (log.isInfoEnabled()) { + + System.err.println("limit=" + limit + ", nrecords=" + nrecords + + ", nsamples=" + a.length + ", sampleType=" + sampleType); + + for (int i = 0; i < a.length && i < 10; i++) { + + System.err.println("a[" + i + "]=" + a[i]); + + } + + } + + final int nexpected = Math.min(nrecords, limit); + + assertEquals("#samples (limit=" + limit + ", nrecords=" + nrecords + + ", nexpected=" + nexpected + ")", nexpected, a.length); + } + } Modified: branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/bop/rdf/joinGraph/TestJoinGraphOnBSBMData.java =================================================================== --- 
branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/bop/rdf/joinGraph/TestJoinGraphOnBSBMData.java 2011-02-17 22:58:07 UTC (rev 4207) +++ branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/bop/rdf/joinGraph/TestJoinGraphOnBSBMData.java 2011-02-18 19:39:00 UTC (rev 4208) @@ -91,7 +91,7 @@ * When true, the test uses hardcoded access to an existing Journal already * loaded with some BSBM data set. */ - private static final boolean useExistingJournal = true; + private static final boolean useExistingJournal = false; // private static final long existingPC = 284826; // BSBM 100M @@ -219,7 +219,7 @@ */ public void test_bsbm_q5() throws Exception { -// QueryLog.logTableHeader(); + QueryLog.logTableHeader(); final String namespace = getNamespace();
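For reference, here is a minimal sketch of how the new annotations combine, modeled on the doTest() helper added to TestSampleIndex in this commit. It assumes a caller-supplied IPredicate<E> [predicate] and a BOpContextBase [context] wired to the local index manager, as in that test; the LIMIT and SEED values are illustrative only.

    import com.bigdata.bop.BOp;
    import com.bigdata.bop.NV;
    import com.bigdata.bop.ap.SampleIndex;
    import com.bigdata.bop.ap.SampleIndex.SampleType;

    // Sample at most 100 tuples from the access path described by the
    // predicate. A non-zero SEED makes the random sample repeatable;
    // the default seed (0L) draws a different sample on each run.
    final SampleIndex<E> sampleOp = new SampleIndex<E>(new BOp[0], //
            NV.asMap(//
                    new NV(SampleIndex.Annotations.PREDICATE, predicate),//
                    new NV(SampleIndex.Annotations.LIMIT, 100),//
                    new NV(SampleIndex.Annotations.SEED, 217L),//
                    new NV(SampleIndex.Annotations.SAMPLE_TYPE,
                            SampleType.RANDOM.name())//
            ));

    // Materialize the sample (an array of at most LIMIT elements).
    final E[] sample = sampleOp.eval(context);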
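The IOffsetSampler implementations can also be exercised directly, which is what the new stress test does. A sketch with illustrative constants; per the interface contract above, the returned offsets are distinct, ascending, and lie in the half-open range [fromIndex, toIndex), and SmartOffsetSampler delegates to whichever strategy suits the limit and range.

    import com.bigdata.bop.ap.SampleIndex.IOffsetSampler;
    import com.bigdata.bop.ap.SampleIndex.SmartOffsetSampler;

    final IOffsetSampler sampler = new SmartOffsetSampler();

    // Choose 10 distinct tuple offsets in the half-open range [0, 1000).
    // A non-zero seed makes the selection repeatable.
    final int[] offsets = sampler.getOffsets(42L/* seed */, 10/* limit */,
            0/* fromIndex */, 1000/* toIndex */);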