From: <tho...@us...> - 2011-02-24 16:49:52
Revision: 4241
          http://bigdata.svn.sourceforge.net/bigdata/?rev=4241&view=rev
Author:   thompsonbry
Date:     2011-02-24 16:49:42 +0000 (Thu, 24 Feb 2011)

Log Message:
-----------
More work on the Runtime optimizer.

JGraph: After each round, including the first, the logic now removes any
entries from the edgeSamples map which do not correspond to a prefix of a
surviving join path. This reduces the heap pressure of the RTO.

SampleBase: Encapsulated the [sample] field as a private atomic reference and
exposed a method to release the sampled solution set. This reduces heap
pressure on the JVM and is forward-looking to storing materialized samples on
the native C heap using the memory manager.

Modified Paths:
--------------
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/joinGraph/rto/JGraph.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/joinGraph/rto/Path.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/joinGraph/rto/SampleBase.java
    branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/bop/rdf/joinGraph/TestJoinGraphOnBSBMData.java

Modified: branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/joinGraph/rto/JGraph.java
===================================================================
--- branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/joinGraph/rto/JGraph.java	2011-02-24 16:42:05 UTC (rev 4240)
+++ branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/joinGraph/rto/JGraph.java	2011-02-24 16:49:42 UTC (rev 4241)
@@ -30,6 +30,7 @@
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.Formatter;
+import java.util.Iterator;
 import java.util.LinkedHashMap;
 import java.util.LinkedHashSet;
 import java.util.LinkedList;
@@ -968,9 +969,15 @@
 
         } // next path
 
+        /*
+         * Now examine the set of generated and sampled join paths. If any paths
+         * span the same vertices then they are alternatives and we can pick the
+         * best alternative now and prune the other alternatives for those
+         * vertices.
+         */
         final Path[] paths_tp1 = tmp.toArray(new Path[tmp.size()]);
 
-        final Path[] paths_tp1_pruned = pruneJoinPaths(paths_tp1);
+        final Path[] paths_tp1_pruned = pruneJoinPaths(paths_tp1, edgeSamples);
 
         if (log.isDebugEnabled())
             log.debug("\n*** round=" + round + ", limit=" + limit
@@ -1153,22 +1160,26 @@
     /**
      * Prune paths which are dominated by other paths. Paths are extended in
      * each round. Paths from previous rounds are always pruned. Of the new
-     * paths in each round, the following rule is applied to prune the
-     * search to just those paths which are known to dominate the other
-     * paths covering the same set of vertices:
+     * paths in each round, the following rule is applied to prune the search to
+     * just those paths which are known to dominate the other paths covering the
+     * same set of vertices:
      * <p>
      * If there is a path, [p] != [p1], where [p] is an unordered variant of
-     * [p1] (that is the vertices of p are the same as the vertices of p1),
-     * and the cumulative cost of [p] is LTE the cumulative cost of [p1],
-     * then [p] dominates (or is equivalent to) [p1] and p1 should be
-     * pruned.
+     * [p1] (that is, the vertices of p are the same as the vertices of p1), and
+     * the cumulative cost of [p] is LTE the cumulative cost of [p1], then [p]
+     * dominates (or is equivalent to) [p1] and p1 should be pruned.
      *
      * @param a
      *            A set of paths.
+     * @param edgeSamples
+     *            The set of samples for path segments.
+     *            Samples which are no longer in use after pruning will be
+     *            cleared from the map and their materialized solution sets
+     *            will be discarded.
      *
      * @return The set of paths with all dominated paths removed.
      */
-    public Path[] pruneJoinPaths(final Path[] a) {
+    public Path[] pruneJoinPaths(final Path[] a,
+            final Map<PathIds, EdgeSample> edgeSamples) {
 
         /*
         * Find the length of the longest path(s). All shorter paths are
         * dropped in each round.
         */
@@ -1237,13 +1248,84 @@
 
             } // Pj
 
        } // Pi
 
-        final Set<Path> keep = new LinkedHashSet<Path>();
-        for (Path p : a) {
-            if (pruned.contains(p))
-                continue;
-            keep.add(p);
+        /*
+         * Generate a set of paths which will be retained.
+         */
+        final Path[] b;
+        {
+            final Set<Path> keep = new LinkedHashSet<Path>();
+
+            for (Path p : a) {
+
+                if (pruned.contains(p))
+                    continue;
+
+                keep.add(p);
+
+            }
+
+            // convert the retained paths to an array.
+            b = keep.toArray(new Path[keep.size()]);
+
         }
-        final Path[] b = keep.toArray(new Path[keep.size()]);
 
+        /*
+         * Clear any entries from the edgeSamples map which are not prefixes of
+         * the retained join paths.
+         */
+        {
+
+            final Iterator<Map.Entry<PathIds, EdgeSample>> itr = edgeSamples
+                    .entrySet().iterator();
+
+            int ncleared = 0;
+
+            while (itr.hasNext()) {
+
+                final Map.Entry<PathIds, EdgeSample> e = itr.next();
+
+                final PathIds ids = e.getKey();
+
+                // Consider the retained paths.
+                boolean found = false;
+
+                for (Path p : b) {
+
+                    if (p.beginsWith(ids.ids)) {
+
+                        // This sample is still in use.
+                        found = true;
+
+                        break;
+
+                    }
+
+                }
+
+                if (!found) {
+
+                    /*
+                     * Clear sample no longer in use.
+                     *
+                     * Note: In fact, holding onto the sample metadata is
+                     * relatively cheap if there was a reason to do so (it only
+                     * affects the size of the [edgeSamples] map). It is holding
+                     * onto the materialized solution set which puts pressure on
+                     * the heap.
+                     */
+                    if (log.isTraceEnabled())
+                        log.trace("Clearing sample: " + ids);
+
+                    // release the sampled solution set.
+                    e.getValue().releaseSample();
+
+                    // clear the entry from the map.
+                    itr.remove();
+
+                    ncleared++;
+
+                }
+
+            }
+
+            if (ncleared > 0 && log.isDebugEnabled())
+                log.debug("Cleared " + ncleared + " samples");
+
+        }
+
+        // Return the set of retained paths.
         return b;
 
     }

Modified: branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/joinGraph/rto/Path.java
===================================================================
--- branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/joinGraph/rto/Path.java	2011-02-24 16:42:05 UTC (rev 4240)
+++ branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/joinGraph/rto/Path.java	2011-02-24 16:49:42 UTC (rev 4241)
@@ -202,7 +202,7 @@
         if (edgeSample == null)
             throw new IllegalArgumentException();
 
-        if (edgeSample.sample == null)
+        if (edgeSample.getSample() == null)
             throw new IllegalArgumentException();
 
//        this.edges = Collections.singletonList(e);
@@ -264,7 +264,7 @@
         if (edgeSample == null)
             throw new IllegalArgumentException();
 
-        if (edgeSample.sample == null)
+        if (edgeSample.getSample() == null)
             throw new IllegalArgumentException();
 
//        this.edges = Collections.unmodifiableList(edges);
@@ -457,9 +457,11 @@
         if (p == null)
             throw new IllegalArgumentException();
 
-        if (vertices.length > p.vertices.length) {
+        if (vertices.length < p.vertices.length) {
 
+            // Proven false since the caller's path is longer.
             return false;
+
         }
 
         for (int i = 0; i < p.vertices.length; i++) {
@@ -480,6 +482,43 @@
     }
 
     /**
+     * Return <code>true</code> if this path begins with the given path.
+     *
+     * @param ids
+     *            The given path.
+     *
+     * @return <code>true</code> if this path begins with the given path.
+     *
+     * @todo unit tests.
+     */
+    public boolean beginsWith(final int[] ids) {
+
+        if (ids == null)
+            throw new IllegalArgumentException();
+
+        if (vertices.length < ids.length) {
+            // Proven false since the caller's path is longer.
+            return false;
+        }
+
+        for (int i = 0; i < ids.length; i++) {
+
+            final int idSelf = vertices[i].pred.getId();
+
+            final int idOther = ids[i];
+
+            if (idSelf != idOther) {
+
+                return false;
+
+            }
+
+        }
+
+        return true;
+
+    }
+
+    /**
      * Return the first N {@link IPredicate}s in this {@link Path}.
      *
      * @param length
@@ -658,7 +697,7 @@
         if (sourceSample == null)
             throw new IllegalArgumentException();
 
-        if (sourceSample.sample == null)
+        if (sourceSample.getSample() == null)
             throw new IllegalArgumentException();
 
         // Figure out which constraints attach to each predicate.
@@ -749,7 +788,7 @@
                 new LocalChunkMessage<IBindingSet>(queryEngine, queryId, joinOp
                         .getId()/* startId */, -1 /* partitionId */,
                         new ThickAsynchronousIterator<IBindingSet[]>(
-                                new IBindingSet[][] { sourceSample.sample })));
+                                new IBindingSet[][] { sourceSample.getSample() })));
 
         final List<IBindingSet> result = new LinkedList<IBindingSet>();
 
         try {

Modified: branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/joinGraph/rto/SampleBase.java
===================================================================
--- branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/joinGraph/rto/SampleBase.java	2011-02-24 16:42:05 UTC (rev 4240)
+++ branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/joinGraph/rto/SampleBase.java	2011-02-24 16:49:42 UTC (rev 4241)
@@ -27,6 +27,10 @@
 
 package com.bigdata.bop.joinGraph.rto;
 
+import java.util.concurrent.atomic.AtomicReference;
+
+import org.apache.log4j.Logger;
+
 import com.bigdata.bop.IBindingSet;
 import com.bigdata.rwstore.sector.IMemoryManager;
 
@@ -36,9 +40,22 @@
  *
  * @author <a href="mailto:tho...@us...">Bryan Thompson</a>
  * @version $Id$
+ *
+ *          TODO Large samples should be buffered on the {@link IMemoryManager}
+ *          so they do not pose a burden on the heap. This will require us to
+ *          manage the allocation contexts so we can release samples in a
+ *          timely manner once they are no longer used and always release
+ *          samples by the time the RTO is finished. [There is an additional
+ *          twist if we have fully materialized some part of the join since we
+ *          no longer need to evaluate that path segment. If the RTO can
+ *          interleave query evaluation with exploration then we can take
+ *          advantage of these materialized solutions.]
  */
 public abstract class SampleBase {
 
+    private static final transient Logger log = Logger
+            .getLogger(SampleBase.class);
+
     /**
      * The estimated cardinality of the underlying access path (for a vertex)
      * or the join (for a cutoff join).
      */
     public final long estimatedCardinality;
 
     /**
-     * The limit used to produce the {@link #sample}.
+     * The limit used to produce the {@link #getSample() sample}.
      */
     public final int limit;
 
@@ -77,20 +94,39 @@
     /**
      * Sample.
+     */
+    private final AtomicReference<IBindingSet[]> sampleRef = new AtomicReference<IBindingSet[]>();
+
+    /**
+     * The sampled solution set.
      *
-     * TODO Large samples should be buffered on the {@link IMemoryManager} so
-     * they do not pose a burden on the heap. This will require us to manage the
-     * allocation contexts so we can release samples in a timely manner once
-     * they are no longer used and always release samples by the time the
-     * RTO is finished.
-     * [There is an additional twist if we have fully materialized some part
-     * of the join since we no longer need to evaluate that path segment. If
-     * the RTO can interleave query evaluation with exploration then we can
-     * take advantage of these materialized solutions.]
+     * @return The sampled solution set -or- <code>null</code> if it has been
+     *         released.
      */
-    final IBindingSet[] sample;
+    IBindingSet[] getSample() {
+
+        return sampleRef.get();
+
+    }
 
     /**
+     * Release the sampled solution set.
      *
+     * TODO MEMORY MANAGER : release.
+     */
+    void releaseSample() {
+
+        if (sampleRef.getAndSet(null) != null) {
+
+            if (log.isTraceEnabled())
+                log.trace("Released sample: " + this);
+
+        }
+
+    }
+
+    /**
+     *
      * @param estimatedCardinality
      *            The estimated cardinality.
      * @param limit
@@ -126,7 +162,7 @@
 
         this.estimateEnum = estimateEnum;
 
-        this.sample = sample;
+        this.sampleRef.set(sample);
 
     }
 
@@ -147,7 +183,10 @@
         sb.append("{estimatedCardinality=" + estimatedCardinality);
         sb.append(",limit=" + limit);
         sb.append(",estimateEnum=" + estimateEnum);
-        sb.append(",sampleSize=" + sample.length);
+        {
+            final IBindingSet[] tmp = sampleRef.get();
+            sb.append(",sampleSize=" + (tmp != null ? tmp.length : "N/A"));
+        }
         toString(sb); // allow extension
         sb.append("}");
         return sb.toString();

Modified: branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/bop/rdf/joinGraph/TestJoinGraphOnBSBMData.java
===================================================================
--- branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/bop/rdf/joinGraph/TestJoinGraphOnBSBMData.java	2011-02-24 16:42:05 UTC (rev 4240)
+++ branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/bop/rdf/joinGraph/TestJoinGraphOnBSBMData.java	2011-02-24 16:49:42 UTC (rev 4241)
@@ -24,10 +24,12 @@
 import com.bigdata.journal.Journal;
 import com.bigdata.rdf.internal.XSDIntIV;
 import com.bigdata.rdf.internal.constraints.CompareBOp;
+import com.bigdata.rdf.internal.constraints.Constraint;
+import com.bigdata.rdf.internal.constraints.IsBoundBOp;
 import com.bigdata.rdf.internal.constraints.MathBOp;
 import com.bigdata.rdf.internal.constraints.NotBOp;
 import com.bigdata.rdf.internal.constraints.SameTermBOp;
-import com.bigdata.rdf.internal.constraints.Constraint;
+import com.bigdata.rdf.model.BigdataLiteral;
 import com.bigdata.rdf.model.BigdataURI;
 import com.bigdata.rdf.model.BigdataValue;
 import com.bigdata.rdf.model.BigdataValueFactory;
@@ -432,19 +434,19 @@
 
         }
 
-        /*
-         * Run w/o constraints.
+        /*
+         * Run w/o constraints.
+         *
+         * Note: There are no solutions for this query against BSBM 100. The
+         * optimizer is only providing the fastest path to prove that. We have
+         * to use a larger data set if we want to verify the optimizer's join
+         * path for a query which produces solutions in the data.
          *
-         * Note: There are no solutions for this query against BSBM 100. The
-         * optimizer is only providing the fastest path to prove that. We have
-         * to use a larger data set if we want to verify the optimizers join
-         * path for a query which produces solutions in the data.
-         *
         * Note: The optimizer finds the same join path for the BSBM 100, 100M,
         * and 200M data sets.
-         */
+         */
         if (true) {
-            /*
+            /*
             100M:
 
            static: ids=[1, 2, 4, 6, 0, 3, 5]
 
            *** round=5, limit=600: paths{in=1,considered=1,out=1}
 
            0 166410 * 1.00 (   600/   600/   600) = 166410 :  998460 [ 1 2 0 4 6 3 5 ]
 
            test_bsbm_q5 : Total times: static=8871, runtime=8107, delta(static-runtime)=764
-             */
+             */
            final IPredicate<?>[] runtimeOrder = doTest(preds, null/* constraints */);
-//            assertEquals("runtimeOrder", new int[] { 1, 2, 0, 4, 6, 3, 5 }, BOpUtility.getPredIds(runtimeOrder));
+            assertEquals("runtimeOrder", new int[] { 1, 2, 0, 4, 6, 3, 5 },
+                    BOpUtility.getPredIds(runtimeOrder));
         }
 
         // Run w/ constraints.
         if (true) {
-            /*
+            /*
             100M:
 
            static: ids=[1, 2, 4, 6, 0, 3, 5]
 
            *** round=5, limit=600: paths{in=4,considered=4,out=1}
 
            0   1941 * 1.00 (     7/     7/     7) =   1941 :  344799 [ 1 2 4 3 6 5 0 ]
 
            test_bsbm_q5 : Total times: static=7312, runtime=3305, delta(static-runtime)=4007
-
-             */
+
+             */
            final IPredicate<?>[] runtimeOrder = doTest(preds, constraints);
-//            assertEquals("runtimeOrder", new int[] { 1, 2, 0, 4, 6, 3, 5 }, BOpUtility.getPredIds(runtimeOrder));
+            assertEquals("runtimeOrder", new int[] { 1, 2, 4, 3, 6, 5, 0 },
+                    BOpUtility.getPredIds(runtimeOrder));
         }
 
     }
 
+    /**
+     * BSBM Q3
+     *
+     * <pre>
+     * PREFIX bsbm-inst: <http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/instances/>
+     * PREFIX bsbm: <http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/>
+     * PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+     * PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+     *
+     * SELECT ?product ?label
+     * WHERE {
+     *     ?product rdfs:label ?label .
+     *     ?product a %ProductType% .
+     *     ?product bsbm:productFeature %ProductFeature1% .
+     *     ?product bsbm:productPropertyNumeric1 ?p1 .
+     *     FILTER ( ?p1 > %x% )
+     *     ?product bsbm:productPropertyNumeric3 ?p3 .
+     *     FILTER (?p3 < %y% )
+     *     OPTIONAL {
+     *         ?product bsbm:productFeature %ProductFeature2% .
+     *         ?product rdfs:label ?testVar }
+     *     FILTER (!bound(?testVar))
+     * }
+     * ORDER BY ?label
+     * LIMIT 10
+     * </pre>
+     */
+    public void test_bsbm_q3() throws Exception {
+
+        fail("This test needs instance data for BSBM 100 and 100M");
+
+        QueryLog.logTableHeader();
+
+        final String namespace = getNamespace();
+
+        final AbstractTripleStore database = getDatabase(namespace);
+
+        /*
+         * Resolve terms against the lexicon.
+         */
+        final BigdataValueFactory valueFactory = database.getLexiconRelation()
+                .getValueFactory();
+
+        final String rdfs = "http://www.w3.org/2000/01/rdf-schema#";
+        final String rdf = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
+        final String bsbm = "http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/";
+        final String bsbmInst = "http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/instances/";
+
+        final BigdataURI rdfType = valueFactory.createURI(rdf + "type");
+
+        final BigdataURI rdfsLabel = valueFactory.createURI(rdfs + "label");
+
+        final BigdataURI productFeature = valueFactory.createURI(bsbm
+                + "productFeature");
+
+        final BigdataURI productPropertyNumeric1 = valueFactory.createURI(bsbm
+                + "productPropertyNumeric1");
+
+        final BigdataURI productPropertyNumeric3 = valueFactory.createURI(bsbm
+                + "productPropertyNumeric3");
+
+        // FIXME parameters
+        final BigdataURI productType = valueFactory.createURI(productInstance);
+        final BigdataURI productFeature1 = valueFactory.createURI(productInstance);
+        final BigdataURI productFeature2 = valueFactory.createURI(productInstance);
+        final BigdataLiteral x = valueFactory.createLiteral(productInstance);
+        final BigdataLiteral y = valueFactory.createLiteral(productInstance);
+
+        final BigdataValue[] terms = new BigdataValue[] { rdfType, rdfsLabel,
+                productFeature, productPropertyNumeric1,
+                productPropertyNumeric3, productType, productFeature1,
+                productFeature2, x, y };
+
+        // resolve terms.
+        database.getLexiconRelation()
+                .addTerms(terms, terms.length, true/* readOnly */);
+
+        {
+            for (BigdataValue tmp : terms) {
+                System.out.println(tmp + " : " + tmp.getIV());
+                if (tmp.getIV() == null)
+                    throw new RuntimeException("Not defined: " + tmp);
+            }
+        }
+
+        final IConstraint[] constraints;
+        final IPredicate[] preds;
+        final IPredicate p0, p1, p2, p3, p4, p5, p6;
+        final IConstraint c0, c1, c2;
+        {
+            final IVariable product = Var.var("product");
+            final IVariable label = Var.var("label");
+            final IVariable p1Var = Var.var("p1");
+            final IVariable p3Var = Var.var("p3");
+            final IVariable testVar = Var.var("testVar");
+
+            // The name space for the SPO relation.
+            final String[] spoRelation = new String[] { namespace + ".spo" };
+
+//            // The name space for the Lexicon relation.
+//            final String[] lexRelation = new String[] { namespace + ".lex" };
+
+            final long timestamp = database.getIndexManager()
+                    .getLastCommitTime();
+
+            int nextId = 0;
+
+            // ?product rdfs:label ?label .
+            p0 = new SPOPredicate(new BOp[] {//
+                    product,
+                    new Constant(rdfsLabel.getIV()),
+                    label //
+                    },//
+                    new NV(BOp.Annotations.BOP_ID, nextId++),//
+                    new NV(Annotations.TIMESTAMP, timestamp),//
+                    new NV(IPredicate.Annotations.RELATION_NAME, spoRelation)//
+            );
+
+            // ?product a %ProductType% .
+            p1 = new SPOPredicate(new BOp[] {//
+                    product,
+                    new Constant(rdfType.getIV()),
+                    new Constant(productType.getIV()) //
+                    },//
+                    new NV(BOp.Annotations.BOP_ID, nextId++),//
+                    new NV(Annotations.TIMESTAMP, timestamp),//
+                    new NV(IPredicate.Annotations.RELATION_NAME, spoRelation)//
+            );
+
+            // ?product bsbm:productFeature %ProductFeature1% .
+            p2 = new SPOPredicate(new BOp[] {//
+                    product,
+                    new Constant(productFeature.getIV()),
+                    new Constant(productFeature1.getIV()) //
+                    },//
+                    new NV(BOp.Annotations.BOP_ID, nextId++),//
+                    new NV(Annotations.TIMESTAMP, timestamp),//
+                    new NV(IPredicate.Annotations.RELATION_NAME, spoRelation)//
+            );
+
+            // ?product bsbm:productPropertyNumeric1 ?p1 .
+            p3 = new SPOPredicate(new BOp[] {//
+                    product,
+                    new Constant(productPropertyNumeric1.getIV()),
+                    p1Var //
+                    },//
+                    new NV(BOp.Annotations.BOP_ID, nextId++),//
+                    new NV(Annotations.TIMESTAMP, timestamp),//
+                    new NV(IPredicate.Annotations.RELATION_NAME, spoRelation)//
+            );
+
+            // ?product bsbm:productPropertyNumeric3 ?p3 .
+            p4 = new SPOPredicate(new BOp[] {//
+                    product,
+                    new Constant(productPropertyNumeric3.getIV()),
+                    p3Var //
+                    },//
+                    new NV(BOp.Annotations.BOP_ID, nextId++),//
+                    new NV(Annotations.TIMESTAMP, timestamp),//
+                    new NV(IPredicate.Annotations.RELATION_NAME, spoRelation)//
+            );
+
+            /*
+             * FIXME (p5,p6) below are in an optional join group!
+             */
+
+            // ?product bsbm:productFeature %ProductFeature2% .
+            p5 = new SPOPredicate(new BOp[] {//
+                    product,
+                    new Constant(productFeature.getIV()),
+                    new Constant(productFeature2.getIV()),
+                    },//
+                    new NV(BOp.Annotations.BOP_ID, nextId++),//
+                    new NV(Annotations.TIMESTAMP, timestamp),//
+                    new NV(IPredicate.Annotations.RELATION_NAME, spoRelation)//
+            );
+
+            // ?product rdfs:label ?testVar }
+            p6 = new SPOPredicate(new BOp[] {//
+                    product,
+                    new Constant(rdfsLabel.getIV()),
+                    testVar,
+                    },//
+                    new NV(BOp.Annotations.BOP_ID, nextId++),//
+                    new NV(Annotations.TIMESTAMP, timestamp),//
+                    new NV(IPredicate.Annotations.RELATION_NAME, spoRelation)//
+            );
+
+            // the vertices of the join graph (the predicates).
+            preds = new IPredicate[] { p0, p1, p2, p3, p4, p5, p6 };
+
+            // FILTER ( ?p1 > %x% )
+            c0 = Constraint.wrap(new CompareBOp(new BOp[] { p1Var,
+                    new Constant(x.getIV()) }, NV.asMap(new NV[] { new NV(
+                    CompareBOp.Annotations.OP, CompareOp.GT) })));
+
+            // FILTER (?p3 < %y% )
+            c1 = Constraint.wrap(new CompareBOp(new BOp[] { p3Var,
+                    new Constant(y.getIV()) }, NV.asMap(new NV[] { new NV(
+                    CompareBOp.Annotations.OP, CompareOp.LT) })));
+
+            // FILTER (!bound(?testVar))
+            c2 = Constraint.wrap(new NotBOp(new IsBoundBOp(testVar)));
+
+            // the constraints on the join graph.
+            constraints = new IConstraint[] { c0, c1, c2 };
+
+        }
+
+        /*
+         * Run the join graph w/ its constraints (?p1>%x% and ?p3<%y%), but not
+         * the optional join group nor its constraint (!bound(?testVar)).
+         *
+         * FIXME The optional join group is part of the tail plan and cannot be
+         * fed into the RTO right now.
+         */
+        final IPredicate<?>[] runtimeOrder = doTest(preds, new IConstraint[] {
+                c0, c1 });
+
+    }
+
 }
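For readers following the RTO changes, the two ideas in this revision can be
illustrated in isolation: (1) after each round, prune the edge-sample map down
to entries whose key is still a prefix of some surviving join path, and (2)
hold the materialized sample behind an atomic reference so it can be released
exactly once. The following is a minimal, self-contained sketch under those
assumptions — the Sample and RtoPruneSketch classes and their method names are
hypothetical stand-ins for illustration, not the bigdata classes themselves.

import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicReference;

// Hypothetical stand-in for SampleBase/EdgeSample: the heavy solution set
// sits behind an AtomicReference so releaseSample() is idempotent.
final class Sample {
    private final AtomicReference<int[][]> solutionsRef;

    Sample(final int[][] solutions) {
        this.solutionsRef = new AtomicReference<int[][]>(solutions);
    }

    int[][] getSample() {
        return solutionsRef.get(); // null once released.
    }

    void releaseSample() {
        // getAndSet(null) drops the heavy array exactly once, even if
        // releaseSample() is invoked from more than one code path.
        solutionsRef.getAndSet(null);
    }
}

final class RtoPruneSketch {

    // true iff [path] begins with the given [prefix] of vertex ids.
    static boolean beginsWith(final int[] path, final int[] prefix) {
        if (path.length < prefix.length)
            return false; // proven false: the prefix is longer than the path.
        for (int i = 0; i < prefix.length; i++)
            if (path[i] != prefix[i])
                return false;
        return true;
    }

    // Drop every sample whose key is no longer a prefix of some retained
    // path, releasing its materialized solutions before removing the entry.
    static int clearUnusedSamples(final Map<List<Integer>, Sample> edgeSamples,
            final int[][] retainedPaths) {
        int ncleared = 0;
        final Iterator<Map.Entry<List<Integer>, Sample>> itr = edgeSamples
                .entrySet().iterator();
        while (itr.hasNext()) {
            final Map.Entry<List<Integer>, Sample> e = itr.next();
            final int[] ids = e.getKey().stream()
                    .mapToInt(Integer::intValue).toArray();
            boolean found = false;
            for (int[] p : retainedPaths) {
                if (beginsWith(p, ids)) {
                    found = true; // still a prefix of a surviving path.
                    break;
                }
            }
            if (!found) {
                e.getValue().releaseSample(); // free the heavy solution set,
                itr.remove();                 // then drop the map entry.
                ncleared++;
            }
        }
        return ncleared;
    }

    public static void main(String[] args) {
        final Map<List<Integer>, Sample> samples = new LinkedHashMap<>();
        samples.put(List.of(1, 2), new Sample(new int[][] { { 0 } }));
        samples.put(List.of(3, 4), new Sample(new int[][] { { 0 } }));
        // Only paths beginning [1, 2, ...] survive, so [3, 4] is cleared.
        final int[][] retained = { { 1, 2, 5 } };
        System.out.println(clearUnusedSamples(samples, retained)); // prints 1
    }
}

Note the ordering in clearUnusedSamples: the sample is released before the
entry is removed, mirroring the commit's point that the map entry itself is
cheap — it is the materialized solution set that puts pressure on the heap.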