From: <tho...@us...> - 2011-02-18 19:39:08
Revision: 4208 http://bigdata.svn.sourceforge.net/bigdata/?rev=4208&view=rev Author: thompsonbry Date: 2011-02-18 19:39:00 +0000 (Fri, 18 Feb 2011) Log Message: ----------- Added support for random sampling of the standalone database B+Tree indices in support of the Runtime Query Optimizer. Modified the Advancer pattern to permit one-time initialization of the advancer. Added information about the selected join path to the JoinGraph INFO trace. Modified Paths: -------------- branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/ap/SampleIndex.java branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/controller/JoinGraph.java branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/join/PipelineJoin.java branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/btree/filter/Advancer.java branches/QUADS_QUERY_BRANCH/bigdata/src/test/com/bigdata/bop/ap/TestSampleIndex.java branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/bop/rdf/joinGraph/TestJoinGraphOnBSBMData.java Modified: branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/ap/SampleIndex.java =================================================================== --- branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/ap/SampleIndex.java 2011-02-17 22:58:07 UTC (rev 4207) +++ branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/ap/SampleIndex.java 2011-02-18 19:39:00 UTC (rev 4208) @@ -27,10 +27,16 @@ package com.bigdata.bop.ap; +import it.unimi.dsi.bits.BitVector; +import it.unimi.dsi.bits.LongArrayBitVector; +import it.unimi.dsi.fastutil.ints.IntOpenHashSet; + import java.io.Serializable; import java.util.ArrayList; +import java.util.Arrays; import java.util.Iterator; import java.util.Map; +import java.util.Random; import java.util.concurrent.Callable; import com.bigdata.bop.AbstractAccessPathOp; @@ -45,6 +51,7 @@ import com.bigdata.btree.ITupleCursor; import com.bigdata.btree.filter.Advancer; import com.bigdata.btree.view.FusedView; +import com.bigdata.rawstore.Bytes; import com.bigdata.relation.IRelation; import com.bigdata.relation.accesspath.AccessPath; import com.bigdata.relation.accesspath.IAccessPath; @@ -79,6 +86,27 @@ private static final long serialVersionUID = 1L; /** + * Typesafe enumeration of different kinds of index sampling strategies. + * + * @todo It is much more efficient to take clusters of samples when you can + * accept the bias. Taking a clustered sample really requires knowing + * where the leaf boundaries are in the index, e.g., using + * {@link ILeafCursor}. Taking all tuples from a few leaves in each + * sample might produce a faster estimation of the correlation when + * sampling join paths. + */ + public static enum SampleType { + /** + * Samples are taken at evenly spaced offsets. + */ + EVEN, + /** + * Sample offsets are computed randomly. + */ + RANDOM; + } + + /** * Known annotations. */ public interface Annotations extends BOp.Annotations { @@ -86,16 +114,32 @@ /** * The sample limit (default {@value #DEFAULT_LIMIT}). */ - String LIMIT = "limit"; + String LIMIT = SampleIndex.class.getName() + ".limit"; int DEFAULT_LIMIT = 100; /** + * The random number generator seed -or- ZERO (0L) for a random seed + * (default {@value #DEFAULT_SEED}). A non-zero value may be used to + * create a repeatable sample. + */ + String SEED = SampleIndex.class.getName() + ".seed"; + + long DEFAULT_SEED = 0L; + + /** * The {@link IPredicate} describing the access path to be sampled * (required). 
*/ String PREDICATE = SampleIndex.class.getName() + ".predicate"; + + /** + * The type of sample to take (default {@value #DEFAULT_SAMPLE_TYPE}). + */ + String SAMPLE_TYPE = SampleIndex.class.getName() + ".sampleType"; + String DEFAULT_SAMPLE_TYPE = SampleType.RANDOM.name(); + } public SampleIndex(SampleIndex<E> op) { @@ -115,7 +159,20 @@ return getProperty(Annotations.LIMIT, Annotations.DEFAULT_LIMIT); } + + public long seed() { + return getProperty(Annotations.SEED, Annotations.DEFAULT_SEED); + + } + + public SampleType getSampleType() { + + return SampleType.valueOf(getProperty(Annotations.SAMPLE_TYPE, + Annotations.DEFAULT_SAMPLE_TYPE)); + + } + @SuppressWarnings("unchecked") public IPredicate<E> getPredicate() { @@ -195,7 +252,7 @@ /** Return a sample from the access path. */ public E[] call() throws Exception { - return sample(limit(), getPredicate()).getSample(); + return sample(limit(), getSampleType(), getPredicate()).getSample(); } @@ -206,7 +263,7 @@ * @return */ public AccessPathSample<E> sample(final int limit, - IPredicate<E> predicate) { + final SampleType sampleType, IPredicate<E> predicate) { final IRelation<E> relation = context.getRelation(predicate); @@ -242,10 +299,25 @@ /* * Add advancer to collect sample. */ + + final Advancer<E> advancer; + switch (sampleType) { + case EVEN: + advancer = new EvenSampleAdvancer<E>(// rangeCount, + limit, accessPath.getFromKey(), accessPath.getToKey()); + break; + case RANDOM: + advancer = new RandomSampleAdvancer<E>(// rangeCount, + seed(), limit, accessPath.getFromKey(), accessPath + .getToKey()); + break; + default: + throw new UnsupportedOperationException("SampleType=" + + sampleType); + } + predicate = ((Predicate<E>) predicate) - .addIndexLocalFilter(new SampleAdvancer<E>(//rangeCount, - limit, accessPath.getFromKey(), accessPath - .getToKey())); + .addIndexLocalFilter(advancer); return new AccessPathSample<E>(limit, context.getAccessPath( relation, predicate)); @@ -256,20 +328,21 @@ /** * An advancer pattern which is designed to take evenly distributed samples - * from an index. The caller specifies the #of tuples to be skipped after - * each tuple visited. That number should be computed based on the estimated - * range count of the index and the desired sample size. This can fail to - * gather the desired number of sample if additional filters are applied - * which further restrict the elements selected by the predicate. However, - * it will still faithfully represent the expected - * cardinality of the - * sampled access path. + * from an index. The caller specifies the #of tuples to be sampled. This + * class estimates the range count of the access path and then computes the + * #of samples to be skipped after each tuple visited. + * <p> + * Note: This can fail to gather the desired number of samples if additional + * filters are applied which further restrict the elements selected by the + * predicate. However, it will still faithfully represent the expected + * cardinality of the sampled access path (tuples tested). * * @author tho...@us... * * @param <E> * The generic type of the elements visited by that access path. */ - private static class SampleAdvancer<E> extends Advancer<E> { + private static class EvenSampleAdvancer<E> extends Advancer<E> { private static final long serialVersionUID = 1L; @@ -296,30 +369,13 @@ * @param limit * The #of samples to visit. 
*/ - public SampleAdvancer(final int limit, final byte[] fromKey, + public EvenSampleAdvancer(final int limit, final byte[] fromKey, final byte[] toKey) { this.limit = limit; this.toKey = toKey; } - /** - * @todo This is taking evenly spaced samples. It is much more efficient - * to take clusters of samples when you can accept the bias. - * Taking a clustered sample really requires knowing where the - * leaf boundaries are in the index, e.g., using - * {@link ILeafCursor}. - * <p> - * Taking all tuples from a few leaves in each sample might - * produce a faster estimation of the correlation when sampling - * join paths. - * - * @todo Rather than evenly spaced samples, we should be taking a random - * sample. This could be achieved using a random initial offset - * and random increment as long as the initial offset was in the - * range of a single increment and we compute the increment such - * that N+1 intervals exist. - */ @Override protected void advance(final ITuple<E> tuple) { @@ -336,6 +392,11 @@ toIndex = toKey == null ? ndx.getEntryCount() : ndx .indexOf(toKey); + if (toIndex < 0) { + // convert insert position to index. + toIndex = -toIndex + 1; + } + final int rangeCount = (toIndex - fromIndex); skipCount = Math.max(1, rangeCount / limit); @@ -365,9 +426,125 @@ } - } // class SampleAdvancer + } // class EvenSampleAdvancer /** + * An advancer pattern which is designed to take randomly distributed + * samples from an index. The caller specifies the #of tuples to be sampled. + * This class estimates the range count of the access path and then computes + * a set of random offsets into the access path from which it will collect + * the desired #of samples. + * <p> + * Note: This can fail to gather the desired number of samples if additional + * filters are applied which further restrict the elements selected by the + * predicate. However, it will still faithfully represent the expected + * cardinality of the sampled access path (tuples tested). + * + * @author tho...@us... + * + * @param <E> + * The generic type of the elements visited by that access path. + */ + private static class RandomSampleAdvancer<E> extends Advancer<E> { + + private static final long serialVersionUID = 1L; + + /** The random number generator seed. */ + private final long seed; + + /** The desired total limit on the sample. */ + private final int limit; + + private final byte[] fromKey, toKey; + + /* + * Transient data. This gets initialized when we visit the first tuple. + */ + + /** The offset of each tuple to be sampled. */ + private transient int[] offsets; + /** The #of tuples accepted so far. */ + private transient int nread = 0; + /** The inclusive lower bound of the first tuple actually visited. */ + private transient int fromIndex; + /** The exclusive upper bound of the last tuple which could be visited. */ + private transient int toIndex; + + /** + * + * @param limit + * The #of samples to visit. + */ + public RandomSampleAdvancer(final long seed, final int limit, + final byte[] fromKey, final byte[] toKey) { + + this.seed = seed; + this.limit = limit; + this.fromKey = fromKey; + this.toKey = toKey; + } + + @Override + protected boolean init() { + + final AbstractBTree ndx = (AbstractBTree) src.getIndex(); + + // inclusive lower bound. + fromIndex = fromKey == null ? 0 : ndx.indexOf(fromKey); + + if (fromIndex < 0) { + // convert insert position to index. + fromIndex = -fromIndex + 1; + } + + // exclusive upper bound. + toIndex = toKey == null ? 
ndx.getEntryCount() : ndx.indexOf(toKey); + + if (toIndex < 0) { + // convert insert position to index. + toIndex = -toIndex + 1; + } + + // get offsets to be sampled. + offsets = new SmartOffsetSampler().getOffsets(seed, limit, + fromIndex, toIndex); + + // Skip to the first tuple. + src.seek(ndx.keyAt(offsets[0])); + + return true; + + } + + @Override + protected void advance(final ITuple<E> tuple) { + + final AbstractBTree ndx = (AbstractBTree) src.getIndex(); + + if (nread < offsets.length - 1) { + + /* + * Skip to the next tuple. + */ + + final int nextIndex = offsets[nread]; + +// System.err.println("limit=" + limit + ", rangeCount=" +// + (toIndex - fromIndex) + ", fromIndex=" + fromIndex +// + ", toIndex=" + toIndex + ", currentIndex=" +// + currentIndex + ", nextIndex=" + nextIndex); + + src.seek(ndx.keyAt(nextIndex)); + + } + + nread++; + + } + + } // class RandomSampleAdvancer + + /** * A sample from an access path. * * @param <E> @@ -459,4 +636,355 @@ } // AccessPathSample + /** + * Interface for obtaining an array of tuple offsets to be sampled. + * + * @author thompsonbry + */ + public interface IOffsetSampler { + + /** + * Return an array of tuple indices which may be used to sample a key + * range of some index. + * <p> + * Note: The caller must stop when it runs out of offsets, not when the + * limit is satisfied, as there will be fewer offsets returned when the + * half open range is smaller than the limit. + * + * @param seed + * The seed for the random number generator -or- ZERO (0L) + * for a random seed. A non-zero value may be used to create + * a repeatable sample. + * @param limit + * The maximum #of tuples to sample. + * @param fromIndex + * The inclusive lower bound. + * @param toIndex + * The exclusive upper bound. + * + * @return An array of at most <i>limit</i> offsets into the index. The + * offsets will lie in the half open range [fromIndex, toIndex). + * The elements of the array will be in ascending order. No + * offsets will be repeated. + * + * @throws IllegalArgumentException + * if <i>limit</i> is non-positive. + * @throws IllegalArgumentException + * if <i>fromIndex</i> is negative. + * @throws IllegalArgumentException + * if <i>toIndex</i> is negative. + * @throws IllegalArgumentException + * unless <i>toIndex</i> is GT <i>fromIndex</i>. + */ + int[] getOffsets(final long seed, int limit, final int fromIndex, + final int toIndex); + } + + /** + * A smart implementation which uses whichever implementation is most + * efficient for the limit and key range to be sampled. + * + * @author thompsonbry + */ + public static class SmartOffsetSampler implements IOffsetSampler { + + /** + * {@inheritDoc} + */ + public int[] getOffsets(final long seed, int limit, + final int fromIndex, final int toIndex) { + + if (limit < 1) + throw new IllegalArgumentException(); + if (fromIndex < 0) + throw new IllegalArgumentException(); + if (toIndex < 0) + throw new IllegalArgumentException(); + if (toIndex <= fromIndex) + throw new IllegalArgumentException(); + + final int rangeCount = (toIndex - fromIndex); + + if (limit > rangeCount) + limit = rangeCount; + + if (limit == rangeCount) { + + // Visit everything. + return new EntireRangeOffsetSampler().getOffsets(seed, limit, + fromIndex, toIndex); + + } + + /* + * Random offsets visiting a subset of the key range using a + * selection without replacement pattern (the same tuple is never + * visited twice). 
+ * + * FIXME When the limit approaches the range count and the range + * count is large (too large for a bit vector or acceptance set + * approach), then we are better off creating a hash set of offsets + * NOT to be visited and then just choosing (rangeCount-limit) + * offsets to reject. This will be less expensive than computing the + * acceptance set directly. However, to really benefit from the + * smaller memory profile, we would also need to wrap that with an + * iterator pattern so the smaller memory representation could be of + * use when the offset[] is applied (e.g., modify the IOffsetSampler + * interface to be an iterator with various ctor parameters rather + * than returning an array as we do today). + */ + + // FIXME BitVectorOffsetSampler is broken. + if (false && rangeCount < Bytes.kilobyte32 * 8) { + + // NB: 32k range count uses a 4k bit vector. + return new BitVectorOffsetSampler().getOffsets(seed, limit, + fromIndex, toIndex); + + } + + /* + * When limit is small (or significantly smaller than the + * rangeCount), then we are much better off creating a hash set of + * the offsets which have been accepted. + * + * Good unless [limit] is very large. + */ + return new AcceptanceSetOffsetSampler().getOffsets(seed, limit, + fromIndex, toIndex); + + } + + } + + /** + * Returns all offsets in the half-open range, but may only be used when + * the limit GTE the range count. + */ + static public class EntireRangeOffsetSampler implements IOffsetSampler { + + /** + * {@inheritDoc} + * + * @throws UnsupportedOperationException + * if <i>limit!=rangeCount</i> (after adjusting for limits + * greater than the rangeCount). + */ + public int[] getOffsets(final long seed, int limit, + final int fromIndex, final int toIndex) { + + if (limit < 1) + throw new IllegalArgumentException(); + if (fromIndex < 0) + throw new IllegalArgumentException(); + if (toIndex < 0) + throw new IllegalArgumentException(); + if (toIndex <= fromIndex) + throw new IllegalArgumentException(); + + final int rangeCount = (toIndex - fromIndex); + + if (limit > rangeCount) + limit = rangeCount; + + if (limit != rangeCount) + throw new UnsupportedOperationException(); + + // offsets of tuples to visit. + final int[] offsets = new int[limit]; + + for (int i = 0; i < limit; i++) { + + offsets[i] = fromIndex + i; + + } + + return offsets; + + } + } + + /** + * Return a randomly selected ordered array of offsets in the given + * half-open range. + * <p> + * This approach is based on a bit vector. If the bit is already marked, + * then the offset has been used and we scan until we find the next free + * offset. This requires [rangeCount] bits, so it works well when the + * rangeCount of the key range is small. For example, a range count of 32k + * requires a 4kb bit vector, which is quite manageable. + * + * FIXME There is something broken in this class, probably an assumption I + * have about how {@link LongArrayBitVector} works. If you enable it in the + * stress test, it will fail. 
+ */ + static public class BitVectorOffsetSampler implements IOffsetSampler { + + public int[] getOffsets(final long seed, int limit, + final int fromIndex, final int toIndex) { + + if (limit < 1) + throw new IllegalArgumentException(); + if (fromIndex < 0) + throw new IllegalArgumentException(); + if (toIndex < 0) + throw new IllegalArgumentException(); + if (toIndex <= fromIndex) + throw new IllegalArgumentException(); + + final int rangeCount = (toIndex - fromIndex); + + if (limit > rangeCount) + limit = rangeCount; + + // offsets of tuples to visit. + final int[] offsets = new int[limit]; + + // create a cleared bit vector of the stated capacity. + final BitVector v = LongArrayBitVector.ofLength(// + rangeCount// capacity (in bits) + ); + + // Random number generator using caller's seed (if given). + final Random rnd = seed == 0L ? new Random() : new Random(seed); + + // Choose random tuple indices for the remaining tuples. + for (int i = 0; i < limit; i++) { + + /* + * Look for an unused bit starting at this index. If necessary, + * this will wrap around to zero. + */ + + // k in (0:rangeCount-1). + int k = rnd.nextInt(rangeCount); + + if (v.getBoolean((long) k)) { + // This bit is already taken. + final long nextZero = v.nextZero((long) k); + if (nextZero != -1L) { + k = (int) nextZero; + } else { + final long priorZero = v.previousZero((long) k); + if (priorZero != -1L) { + k = (int) priorZero; + } else { + // No empty bit found? + throw new AssertionError(); + } + } + } + + assert !v.getBoolean(k); + + // Set the bit. + v.add(k, true); + + assert v.getBoolean(k); + + offsets[i] = fromIndex + k; + + assert offsets[i] < toIndex; + + } + + // put them into sorted order for more efficient traversal. + Arrays.sort(offsets); + + // System.err.println(Arrays.toString(offsets)); + + return offsets; + + } + + } + + /** + * An implementation based on an acceptance set recording the offsets which + * have already been accepted. This implementation is a good choice when the + * limit is moderate (~100k) and the rangeCount is significantly greater + * than the limit. The memory demand is O(limit). + * + * @author thompsonbry + */ + static public class AcceptanceSetOffsetSampler implements IOffsetSampler { + + public int[] getOffsets(final long seed, int limit, + final int fromIndex, final int toIndex) { + + if (limit < 1) + throw new IllegalArgumentException(); + if (fromIndex < 0) + throw new IllegalArgumentException(); + if (toIndex < 0) + throw new IllegalArgumentException(); + if (toIndex <= fromIndex) + throw new IllegalArgumentException(); + + final int rangeCount = (toIndex - fromIndex); + + if (limit > rangeCount) + limit = rangeCount; + + // offsets of tuples to visit. + final int[] offsets = new int[limit]; + + // hash set of accepted offsets. + final IntOpenHashSet v = new IntOpenHashSet( + rangeCount// capacity + ); + + // Random number generator using caller's seed (if given). + final Random rnd = seed == 0L ? new Random() : new Random(seed); + + // Choose random tuple indices for the remaining tuples. + for (int i = 0; i < limit; i++) { + + /* + * Look for an unused offset starting at this index. If + * necessary, this will wrap around to zero. + */ + + // k in (0:rangeCount-1). + int k = rnd.nextInt(rangeCount); + + int round = 0; + while (v.contains(k)) { + + k++; + + if (k == rangeCount) { + // wrap around. + if (++round > 1) { + // no unused offset found? + throw new AssertionError(); + } + // reset starting index. + k = 0; + } + + } + + assert !v.contains(k); + + // Record the accepted offset. 
+ v.add(k); + + offsets[i] = fromIndex + k; + + assert offsets[i] < toIndex; + + } + + // put them into sorted order for more efficient traversal. + Arrays.sort(offsets); + + // System.err.println(Arrays.toString(offsets)); + + return offsets; + + } + + } + } Modified: branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/controller/JoinGraph.java =================================================================== --- branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/controller/JoinGraph.java 2011-02-17 22:58:07 UTC (rev 4207) +++ branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/controller/JoinGraph.java 2011-02-18 19:39:00 UTC (rev 4208) @@ -60,6 +60,7 @@ import com.bigdata.bop.NV; import com.bigdata.bop.PipelineOp; import com.bigdata.bop.ap.SampleIndex; +import com.bigdata.bop.ap.SampleIndex.SampleType; import com.bigdata.bop.bindingSet.HashBindingSet; import com.bigdata.bop.engine.IRunningQuery; import com.bigdata.bop.engine.LocalChunkMessage; @@ -534,11 +535,16 @@ * Materialize a random sample from the access path. */ - final SampleIndex sampleOp = new SampleIndex( +// final SampleType sampleType = SampleType.EVEN; + final SampleType sampleType = SampleType.RANDOM; + + final SampleIndex<?> sampleOp = new SampleIndex( new BOp[] {}, // NV.asMap(// new NV(SampleIndex.Annotations.PREDICATE, pred),// - new NV(SampleIndex.Annotations.LIMIT, limit))); + new NV(SampleIndex.Annotations.LIMIT, limit),// + new NV(SampleIndex.Annotations.SAMPLE_TYPE, sampleType.name())// + )); sample = new VertexSample(rangeCount, limit, false/* exact */, sampleOp.eval(context)); @@ -1081,7 +1087,7 @@ // @todo Why not use a factory which avoids bopIds already in use? new NV(PipelineJoin.Annotations.PREDICATE, vTarget.pred.setBOpId(3)), // disallow parallel evaluation of tasks. - new NV(PipelineJoin.Annotations.MAX_PARALLEL,1), + new NV(PipelineOp.Annotations.MAX_PARALLEL,1), // disallow parallel evaluation of chunks. new NV(PipelineJoin.Annotations.MAX_PARALLEL_CHUNKS,0), // disable access path coalescing @@ -1172,6 +1178,11 @@ * cardinality at 1600L (lower bound). In fact, the cardinality * is 16*175000. This falsely low estimate can cause solutions * which are really better to be dropped. + * + * @todo we should mark [nout] when we do this so that it + * shows up in the trace! Also, the rangeCount is sometimes + * falsely high. However, that should be corrected by random + * resampling of the vertices and paths. */ nout = sumRangeCount; @@ -1226,11 +1237,14 @@ /** * The cumulative estimated cardinality of the path. This is zero for an * empty path. For a path consisting of a single edge, this is the - * estimated cardinality of that edge. When creating a new path adding - * an edge to an existing path, the cumulative cardinality of the new - * path is the cumulative cardinality of the existing path plus the + * estimated cardinality of that edge. When creating a new path by + * adding an edge to an existing path, the cumulative cardinality of the + * new path is the cumulative cardinality of the existing path plus the * estimated cardinality of the cutoff join of the new edge given the * input sample of the existing path. + * + * @todo track this per vertex as well as the total for more interesting + * traces in showPath(Path). 
*/ final public long cumulativeEstimatedCardinality; @@ -1672,7 +1686,7 @@ static public String showTable(final Path[] a,final Path[] pruned) { final StringBuilder sb = new StringBuilder(); final Formatter f = new Formatter(sb); - f.format("%5s %10s%1s * %7s (%3s/%3s) = %10s%1s : %10s %10s", + f.format("%5s %10s%1s * %10s (%6s/%6s) = %10s%1s : %10s %10s", "path",// "rangeCount",// "",// sourceSampleExact @@ -1698,9 +1712,9 @@ } } if (x.sample == null) { - f.format("p[%2d] %10d%1s * %7s (%3s/%3s) = %10s%1s : %10s", i, "N/A", "", "N/A", "N/A", "N/A", "N/A", "", "N/A"); + f.format("p[%2d] %10s%1s * %10s (%6s/%6s) = %10s%1s : %10s", i, "N/A", "", "N/A", "N/A", "N/A", "N/A", "", "N/A"); } else { - f.format("p[%2d] %10d%1s * % 7.2f (%3d/%3d) = % 10d%1s : % 10d", i, + f.format("p[%2d] %10d%1s * % 10.2f (%6d/%6d) = % 10d%1s : % 10d", i, x.sample.rangeCount,// x.sample.sourceSampleExact?"E":"",// x.sample.f,// @@ -1730,6 +1744,66 @@ } /** + * Show the details of a join path, including the estimated cardinality and + * join hit ratio for each step in the path. + * + * @param x + * The join path. + */ + public static String showPath(final Path x) { + if(x == null) + throw new IllegalArgumentException(); + final StringBuilder sb = new StringBuilder(); + final Formatter f = new Formatter(sb); + { + /* + * @todo show sumEstCard for each step of the path. Only the + * estimate for the current path length is currently preserved. We + * would need to preserve the estimate for each step in the path to + * show it here. + * + * @todo show limit on EdgeSample? + */ + f.format("%6s %10s%1s * %10s (%6s/%6s) = %10s%1s",// : %10s",// + "edge", + "rangeCount",// + "",// sourceSampleExact + "f",// + "out",// + "in",// + "estCard",// + ""// estimateIs(Exact|LowerBound|UpperBound) +// "sumEstCard",// + ); + int i = 0; + for (Edge e : x.edges) { + sb.append("\n"); + if (e.sample == null) { + f.format("%6s %10s%1s * %10s (%6s/%6s) = %10s%1s",// + e.getLabel(),// + "N/A", "", "N/A", "N/A", "N/A", "N/A", "", "N/A"); + } else { + f.format("%6s %10d%1s * % 10.2f (%6d/%6d) = % 10d%1s",// + e.getLabel(),// + e.sample.rangeCount,// + e.sample.sourceSampleExact ? "E" : "",// + e.sample.f,// + e.sample.outputCount,// + e.sample.inputCount,// + e.sample.estimatedCardinality,// + e.sample.estimateEnum.getCode()// +// e.cumulativeEstimatedCardinality// + ); + } +// sb.append("\nv[" + vertexIds[i] + "] " + e.toString()); + i++; + } + } + sb.append("\n"); + return sb.toString(); + } + + /** * A runtime optimizer for a join graph. The {@link JoinGraph} bears some * similarity to ROX (Runtime Optimizer for XQuery), but has several * significant differences: @@ -2148,6 +2222,18 @@ // Should be one winner. assert paths.length == 1; + if (log.isInfoEnabled()) { + + /* + * @todo It would be nice to show the plan with the filters + * attached, but that might be something that the caller does. 
+ */ + log.info("\n*** Selected join path: " + + Arrays.toString(paths[0].getVertexIds()) + "\n" + + showPath(paths[0])); + + } + return paths[0]; } Modified: branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/join/PipelineJoin.java =================================================================== --- branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/join/PipelineJoin.java 2011-02-17 22:58:07 UTC (rev 4207) +++ branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/join/PipelineJoin.java 2011-02-18 19:39:00 UTC (rev 4208) @@ -204,9 +204,7 @@ * The maximum #of solutions which will be generated by the join * (default {@value #DEFAULT_LIMIT}). * - * @todo Unit tests for this feature. It currently breaks out of loops - * but does not explicitly interrupt the task. See the uses of - * {@link JoinTask#limit}. + * @todo Unit tests for this feature (it is used by the JoinGraph). */ String LIMIT = PipelineJoin.class.getName() + ".limit"; @@ -594,6 +592,8 @@ /** * An optional limit on the #of solutions to be produced. The limit is * ignored if it is {@link Long#MAX_VALUE}. + * + * @see Annotations#LIMIT */ final private long limit; @@ -808,6 +808,9 @@ // // stats.elapsed.add(System.currentTimeMillis() - begin); +// } finally { +// System.err.println(joinOp.toString()); +// System.err.println(stats.toString()); } } @@ -1624,6 +1627,12 @@ halted(); if (limit != Long.MAX_VALUE && exactOutputCount.get() > limit) { + // break query @ limit. + if (log.isInfoEnabled()) + log.info("Breaking query @ limit: limit=" + limit + + ", exactOutputCount=" + + exactOutputCount.get()); +// halt((Void) null); return null; } @@ -1713,6 +1722,12 @@ if (limit != Long.MAX_VALUE && exactOutputCount.incrementAndGet() > limit) { + // break query @ limit. + if (log.isInfoEnabled()) + log.info("Breaking query @ limit: limit=" + limit + + ", exactOutputCount=" + + exactOutputCount.get()); +// halt((Void) null); break; } @@ -1927,6 +1942,12 @@ if (limit != Long.MAX_VALUE && exactOutputCount.incrementAndGet() > limit) { + // break query @ limit. + if (log.isInfoEnabled()) + log.info("Breaking query @ limit: limit=" + limit + + ", exactOutputCount=" + + exactOutputCount.get()); +// halt((Void) null); break; } @@ -2119,6 +2140,12 @@ if (limit != Long.MAX_VALUE && exactOutputCount.incrementAndGet() > limit) { + // break query @ limit. + if (log.isInfoEnabled()) + log.info("Breaking query @ limit: limit=" + limit + + ", exactOutputCount=" + + exactOutputCount.get()); +// halt((Void) null); break; } Modified: branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/btree/filter/Advancer.java =================================================================== --- branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/btree/filter/Advancer.java 2011-02-17 22:58:07 UTC (rev 4207) +++ branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/btree/filter/Advancer.java 2011-02-18 19:39:00 UTC (rev 4208) @@ -63,6 +63,18 @@ } + /** + * Hook for one-time initialization invoked before the advancer visits the + * first tuple. The default implementation simply returns <code>true</code>. + * + * @return <code>false</code> if nothing should be visited. + */ + protected boolean init() { + + return true; + + } + /** * Offers an opportunity to advance the source {@link ITupleCursor} to a * new key using {@link ITupleCursor#seek(byte[]). @@ -87,6 +99,11 @@ final private Advancer<E> filter; /** + * Used to invoke {@link Advancer#init()}. 
+ */ + private boolean firstTime = true; + + /** * Set true iff we exceed the bounds on the {@link ITupleCursor}. For * example, if we run off the end of an index partition. This is used to * simulate the exhaustion of the cursor when you advance past its @@ -116,6 +133,20 @@ public boolean hasNext() { + if(firstTime) { + + if (!filter.init()) { + + exhausted = true; + + return false; + + } + + firstTime =false; + + } + if(exhausted) return false; return src.hasNext(); Modified: branches/QUADS_QUERY_BRANCH/bigdata/src/test/com/bigdata/bop/ap/TestSampleIndex.java =================================================================== --- branches/QUADS_QUERY_BRANCH/bigdata/src/test/com/bigdata/bop/ap/TestSampleIndex.java 2011-02-17 22:58:07 UTC (rev 4207) +++ branches/QUADS_QUERY_BRANCH/bigdata/src/test/com/bigdata/bop/ap/TestSampleIndex.java 2011-02-18 19:39:00 UTC (rev 4208) @@ -41,6 +41,10 @@ import com.bigdata.bop.NV; import com.bigdata.bop.Var; import com.bigdata.bop.IPredicate.Annotations; +import com.bigdata.bop.ap.SampleIndex.AcceptanceSetOffsetSampler; +import com.bigdata.bop.ap.SampleIndex.IOffsetSampler; +import com.bigdata.bop.ap.SampleIndex.SampleType; +import com.bigdata.bop.ap.SampleIndex.SmartOffsetSampler; import com.bigdata.journal.BufferMode; import com.bigdata.journal.ITx; import com.bigdata.journal.Journal; @@ -175,8 +179,124 @@ } /** + * Stress test for {@link IOffsetSampler}s. + * + * TODO Look at the distributions of the different {@link IOffsetSampler}s. + * They should be uniform. + */ + public void test_offsetSamplers() { + + // Note: Only handles a special case! +// new GetOffsetsEntireRange(), + + final IOffsetSampler[] samplers = new IOffsetSampler[] { + new SmartOffsetSampler(), // +// new BitVectorOffsetSampler(),// + new AcceptanceSetOffsetSampler(),// +// new RejectionSetOffsetSampler(), // + }; + + final Random r = new Random(); + + final int ntrials = 1000; + + for (int trial = 0; trial < ntrials; trial++) { + + // 10% seed is 0L (which gets turned into random anyway) + final long seed = r.nextDouble() < .1 ? 0 : r.nextLong(); + + final int entryCount = r.nextInt(100000); + + // 10% fromIndex is zero. + final int fromIndex = r.nextDouble() < .1 ? 0 : r + .nextInt(entryCount); + + final int remaining = entryCount - fromIndex; + + final int toIndex = r.nextDouble() < .1 ? entryCount : (fromIndex + + r.nextInt(remaining) + 1); + + final int rangeCount = toIndex - fromIndex; + + final int limit = r.nextDouble() < .1 ? r.nextInt(100) + 1 : r + .nextDouble() < .5 ? r.nextInt(entryCount) + 1 : r + .nextInt(10000) + 1; + + for (IOffsetSampler sampler : samplers) { + + try { + + final long begin = System.currentTimeMillis(); + + final int[] offsets = sampler.getOffsets(seed, limit, fromIndex, toIndex); + + final long elapsed = System.currentTimeMillis() - begin; + + if (elapsed > 1000) { + log.warn("Slow: elapsed=" + elapsed + ", class=" + + sampler.getClass() + ", seed=" + seed + + ", limit=" + limit + ", fromIndex=" + + fromIndex + ",toIndex=" + toIndex); + } + + // check the #of offsets returned. + final int noffsets = offsets.length; + assertTrue(noffsets <= limit); + if (limit > rangeCount) + assertTrue(noffsets <= rangeCount); + else + assertTrue(noffsets == limit); + + // check offsets ordered, within range, and w/o dups. 
+ int lastOffset = -1; + for (int j = 0; j < offsets.length; j++) { + + final int offset = offsets[j]; + + if (offset < fromIndex) + fail("index=" + j + + ", offset LT fromIndex: offset=" + offset + + ", fromIndex=" + fromIndex); + + if (offset >= toIndex) + fail("index=" + j + ", offset GTE toIndex: offset=" + + offset + ", toIndex=" + toIndex); + + if (offset <= lastOffset) { + fail("index=" + j + ", lastOffset=" + lastOffset + + ", but offset=" + offset); + } + + lastOffset = offset; + + } + + } catch (Throwable t) { + + fail("sampler=" + sampler.getClass() + ", seed=" + seed + + ", limit=" + limit + ", fromIndex=" + fromIndex + + ",toIndex=" + toIndex + ", rangeCount=" + + rangeCount, t); + + } + + } + + } + + } + + /** * Unit test verifies some aspects of a sample taken from a local index * (primarily that the sample respects the limit). + * + * @todo test when the range count is zero. + * + * @todo test when the inclusive lower bound of a key range is an insertion + * point (no tuple for that key). + * + * @todo test when the exclusive upper bound of a key range is an insertion + * point (no tuple for that key). */ public void test_something() { @@ -194,42 +314,59 @@ new NV(Annotations.TIMESTAMP, ITx.READ_COMMITTED)// ); - final BOpContextBase context = new BOpContextBase(null/* fed */, jnl/* indexManager */); - final int[] limits = new int[] { // 1, 9, 19, 100, 217, 900,// nrecords, nrecords + 1 }; - for (int limit : limits) { + for (SampleType sampleType : SampleType.values()) { - final SampleIndex<E> sampleOp = new SampleIndex<E>( - new BOp[0], - NV - .asMap( - // - new NV(SampleIndex.Annotations.PREDICATE, - predicate),// - new NV(SampleIndex.Annotations.LIMIT, limit)// - )); + if (log.isInfoEnabled()) + log.info("Testing: SampleType=" + sampleType); - final E[] a = sampleOp.eval(context); + for (int limit : limits) { -// System.err.println("limit=" + limit + ", nrecords=" + nrecords -// + ", nsamples=" + a.length); -// -// for (int i = 0; i < a.length && i < 10; i++) { -// System.err.println("a[" + i + "]=" + a[i]); -// } + doTest(nrecords, limit, sampleType, predicate); - final int nexpected = Math.min(nrecords, limit); + } - assertEquals("#samples (limit=" + limit + ", nrecords=" + nrecords - + ", nexpected=" + nexpected + ")", nexpected, a.length); - } } + + private void doTest(final int nrecords, final int limit, + final SampleType sampleType, final IPredicate<E> predicate) { + final BOpContextBase context = new BOpContextBase(null/* fed */, jnl/* indexManager */); + + final SampleIndex<E> sampleOp = new SampleIndex<E>( new BOp[0], // + NV.asMap(// + new NV(SampleIndex.Annotations.PREDICATE, predicate),// + new NV(SampleIndex.Annotations.LIMIT, limit),// + new NV(SampleIndex.Annotations.SAMPLE_TYPE, sampleType + .name())// + )); + + final E[] a = sampleOp.eval(context); + + if (log.isInfoEnabled()) { + + System.err.println("limit=" + limit + ", nrecords=" + nrecords + + ", nsamples=" + a.length + ", sampleType=" + sampleType); + + for (int i = 0; i < a.length && i < 10; i++) { + + System.err.println("a[" + i + "]=" + a[i]); + + } + + } + + final int nexpected = Math.min(nrecords, limit); + + assertEquals("#samples (limit=" + limit + ", nrecords=" + nrecords + + ", nexpected=" + nexpected + ")", nexpected, a.length); + } + } Modified: branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/bop/rdf/joinGraph/TestJoinGraphOnBSBMData.java =================================================================== --- 
branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/bop/rdf/joinGraph/TestJoinGraphOnBSBMData.java 2011-02-17 22:58:07 UTC (rev 4207) +++ branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/bop/rdf/joinGraph/TestJoinGraphOnBSBMData.java 2011-02-18 19:39:00 UTC (rev 4208) @@ -91,7 +91,7 @@ * When true, the test uses hardcoded access to an existing Journal already * loaded with some BSBM data set. */ - private static final boolean useExistingJournal = true; + private static final boolean useExistingJournal = false; // private static final long existingPC = 284826; // BSBM 100M @@ -219,7 +219,7 @@ */ public void test_bsbm_q5() throws Exception { -// QueryLog.logTableHeader(); + QueryLog.logTableHeader(); final String namespace = getNamespace();
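For reference, here is a minimal sketch of how the new annotations combine, modeled on the doTest() helper added to TestSampleIndex in this commit. It assumes a caller-supplied IPredicate<E> [predicate] and a BOpContextBase [context] wired to the local index manager, as in that test; the LIMIT and SEED values are illustrative only.

    import com.bigdata.bop.BOp;
    import com.bigdata.bop.NV;
    import com.bigdata.bop.ap.SampleIndex;
    import com.bigdata.bop.ap.SampleIndex.SampleType;

    // Sample at most 100 tuples from the access path described by the
    // predicate. A non-zero SEED makes the random sample repeatable;
    // the default seed (0L) draws a different sample on each run.
    final SampleIndex<E> sampleOp = new SampleIndex<E>(new BOp[0], //
            NV.asMap(//
                    new NV(SampleIndex.Annotations.PREDICATE, predicate),//
                    new NV(SampleIndex.Annotations.LIMIT, 100),//
                    new NV(SampleIndex.Annotations.SEED, 217L),//
                    new NV(SampleIndex.Annotations.SAMPLE_TYPE,
                            SampleType.RANDOM.name())//
            ));

    // Materialize the sample (an array of at most LIMIT elements).
    final E[] sample = sampleOp.eval(context);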
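The IOffsetSampler implementations can also be exercised directly, which is what the new stress test does. A sketch with illustrative constants; per the interface contract above, the returned offsets are distinct, ascending, and lie in the half-open range [fromIndex, toIndex), and SmartOffsetSampler delegates to whichever strategy suits the limit and range.

    import com.bigdata.bop.ap.SampleIndex.IOffsetSampler;
    import com.bigdata.bop.ap.SampleIndex.SmartOffsetSampler;

    final IOffsetSampler sampler = new SmartOffsetSampler();

    // Choose 10 distinct tuple offsets in the half-open range [0, 1000).
    // A non-zero seed makes the selection repeatable.
    final int[] offsets = sampler.getOffsets(42L/* seed */, 10/* limit */,
            0/* fromIndex */, 1000/* toIndex */);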