[Bigdata-commit] SF.net SVN: bigdata:[4218] branches/QUADS_QUERY_BRANCH/bigdata

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 4218
          http://bigdata.svn.sourceforge.net/bigdata/?rev=4218&view=rev
Author:   thompsonbry
Date:     2011-02-21 22:01:49 +0000 (Mon, 21 Feb 2011)

Log Message:
-----------
- Working on join graphs and the runtime query optimizer.

- Moved JoinGraph, NoSolutionsException, and PartitionedJoinGraph into
  the com.bigdata.bop.joinGraph package.

- Moved IEvaluationPlan, IEvaluationPlanFactory,
  DefaultEvaluationPlan, etc. into the com.bigdata.bop.joinGraph
  package.

- Moved BOpUtility into the com.bigdata.bop.util package.

- Added partial support for dynamically identifying edges based on constraints and for accepting unconstrained edges once there are no other vertices available to extend the join graph.

Modified Paths:
--------------
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/BOpUtility.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/IPredicate.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/controller/package.html
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/relation/rule/eval/AbstractJoinNexus.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/relation/rule/eval/AbstractJoinNexusFactory.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/relation/rule/eval/IJoinNexus.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/relation/rule/eval/IJoinNexusFactory.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/relation/rule/eval/IRuleState.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/relation/rule/eval/IRuleStatisticsFactory.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/relation/rule/eval/RuleState.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/relation/rule/eval/RuleStats.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/test/com/bigdata/bop/TestAll.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/test/com/bigdata/bop/ap/MockJoinNexusFactory.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/test/com/bigdata/bop/ap/filter/TestAll.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/test/com/bigdata/bop/controller/TestAll.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/test/com/bigdata/relation/rule/eval/TestAll.java
    branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/inf/OwlSameAsPropertiesExpandingIterator.java
    branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/rules/InferenceEngine.java
    branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/rules/RDFJoinNexus.java
    branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/rules/RDFJoinNexusFactory.java
    branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/store/AbstractTripleStore.java
    branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/bop/rdf/joinGraph/AbstractJoinGraphTestCase.java
    branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/bop/rdf/joinGraph/TestJoinGraphOnBSBMData.java
    branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/bop/rdf/joinGraph/TestJoinGraphOnBarData.java
    branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/bop/rdf/joinGraph/TestJoinGraphOnLubm.java
    branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/rdf/internal/constraints/TestInlineConstraints.java
    branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/rdf/magic/TestIRIS.java
    branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/rdf/rules/AbstractRuleTestCase.java
    branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/rdf/rules/TestOptionals.java
    branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/rdf/rules/TestRuleExpansion.java
    branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/rdf/spo/TestSPORelation.java
    branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/rdf/spo/TestSPOStarJoin.java
    branches/QUADS_QUERY_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/Rule2BOpUtility.java

Added Paths:
-----------
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/joinGraph/
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/joinGraph/DefaultRangeCountFactory.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/joinGraph/FixedEvaluationPlanFactory.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/joinGraph/IEvaluationPlan.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/joinGraph/IEvaluationPlanFactory.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/joinGraph/IRangeCountFactory.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/joinGraph/NOPEvaluationPlanFactory.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/joinGraph/NoReorderEvaluationPlan.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/joinGraph/NoSolutionsException.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/joinGraph/PartitionedJoinGroup.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/joinGraph/fast/
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/joinGraph/fast/DefaultEvaluationPlan2.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/joinGraph/fast/DefaultEvaluationPlanFactory2.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/joinGraph/package.html
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/joinGraph/rto/
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/joinGraph/rto/JoinGraph.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/util/
    branches/QUADS_QUERY_BRANCH/bigdata/src/test/com/bigdata/bop/joinGraph/
    branches/QUADS_QUERY_BRANCH/bigdata/src/test/com/bigdata/bop/joinGraph/TestAll.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/test/com/bigdata/bop/joinGraph/TestPartitionedJoinGroup.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/test/com/bigdata/bop/joinGraph/fast/
    branches/QUADS_QUERY_BRANCH/bigdata/src/test/com/bigdata/bop/joinGraph/fast/TestAll.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/test/com/bigdata/bop/joinGraph/fast/TestDefaultEvaluationPlan.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/test/com/bigdata/bop/joinGraph/rto/
    branches/QUADS_QUERY_BRANCH/bigdata/src/test/com/bigdata/bop/joinGraph/rto/TestAll.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/test/com/bigdata/bop/joinGraph/rto/TestJGraph.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/test/com/bigdata/bop/joinGraph/rto/TestJoinGraph.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/test/com/bigdata/bop/util/
    branches/QUADS_QUERY_BRANCH/bigdata/src/test/com/bigdata/bop/util/TestAll.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/test/com/bigdata/bop/util/TestBOpUtility.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/test/com/bigdata/bop/util/TestBOpUtility_canJoin.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/test/com/bigdata/bop/util/TestBOpUtility_canJoinUsingConstraints.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/test/com/bigdata/bop/util/TestBOpUtility_sharedVariables.java

Removed Paths:
-------------
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/controller/JoinGraph.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/controller/NoSolutionsException.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/controller/PartitionedJoinGroup.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/relation/rule/eval/DefaultEvaluationPlan2.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/relation/rule/eval/DefaultEvaluationPlanFactory2.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/relation/rule/eval/DefaultRangeCountFactory.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/relation/rule/eval/FixedEvaluationPlanFactory.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/relation/rule/eval/IEvaluationPlan.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/relation/rule/eval/IEvaluationPlanFactory.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/relation/rule/eval/IRangeCountFactory.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/relation/rule/eval/NOPEvaluationPlanFactory.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/relation/rule/eval/NoReorderEvaluationPlan.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/test/com/bigdata/bop/TestBOpUtility.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/test/com/bigdata/bop/TestBOpUtility_canJoin.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/test/com/bigdata/bop/TestBOpUtility_canJoinUsingConstraints.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/test/com/bigdata/bop/controller/TestJGraph.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/test/com/bigdata/bop/controller/TestJoinGraph.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/test/com/bigdata/bop/controller/TestPartitionedJoinGroup.java
    branches/QUADS_QUERY_BRANCH/bigdata/src/test/com/bigdata/relation/rule/eval/TestDefaultEvaluationPlan.java

Modified: branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/BOpUtility.java
===================================================================

--- branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/BOpUtility.java	2011-02-21 18:33:22 UTC (rev 4217)
+++ branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/BOpUtility.java	2011-02-21 22:01:49 UTC (rev 4218)
@@ -40,8 +40,8 @@
 import org.apache.log4j.Logger;
 
 import com.bigdata.bop.BOp.Annotations;
-import com.bigdata.bop.controller.PartitionedJoinGroup;
 import com.bigdata.bop.engine.BOpStats;
+import com.bigdata.bop.joinGraph.PartitionedJoinGroup;
 import com.bigdata.btree.AbstractNode;
 import com.bigdata.relation.accesspath.IAsynchronousIterator;
 import com.bigdata.relation.accesspath.IBlockingBuffer;
@@ -1381,6 +1381,9 @@
             /*
              * Find the constraints that will run with each vertex of the new
              * join path.
+             * 
+             * TODO This is a forward reference to a different package, so maybe
+             * move the canJoinWithConstraints() method to that package?
              */
             final IConstraint[][] constraintRunArray = PartitionedJoinGroup
                     .getJoinGraphConstraints(newPath, constraints);

Modified: branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/IPredicate.java
===================================================================
--- branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/IPredicate.java	2011-02-21 18:33:22 UTC (rev 4217)
+++ branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/IPredicate.java	2011-02-21 22:01:49 UTC (rev 4218)
@@ -34,6 +34,7 @@
 import com.bigdata.bop.ap.filter.BOpFilterBase;
 import com.bigdata.bop.ap.filter.BOpTupleFilter;
 import com.bigdata.bop.ap.filter.DistinctFilter;
+import com.bigdata.bop.joinGraph.IEvaluationPlan;
 import com.bigdata.btree.IRangeQuery;
 import com.bigdata.btree.ITuple;
 import com.bigdata.btree.ITupleCursor;
@@ -47,7 +48,6 @@
 import com.bigdata.relation.accesspath.IAccessPath;
 import com.bigdata.relation.rule.IAccessPathExpander;
 import com.bigdata.relation.rule.IRule;
-import com.bigdata.relation.rule.eval.IEvaluationPlan;
 import com.bigdata.relation.rule.eval.pipeline.JoinMasterTask;
 import com.bigdata.service.ndx.IClientIndex;
 import com.bigdata.striterator.IKeyOrder;

Deleted: branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/controller/JoinGraph.java
===================================================================
--- branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/controller/JoinGraph.java	2011-02-21 18:33:22 UTC (rev 4217)
+++ branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/controller/JoinGraph.java	2011-02-21 22:01:49 UTC (rev 4218)
@@ -1,3260 +0,0 @@
-/**
-
-Copyright (C) SYSTAP, LLC 2006-2010.  All rights reserved.
-
-Contact:
-     SYSTAP, LLC
-     4501 Tower Road
-     Greensboro, NC 27410
-     lic...@bi...
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; version 2 of the License.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
-/*
- * Created on Aug 16, 2010
- */
-
-package com.bigdata.bop.controller;
-
-import java.io.Serializable;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.Formatter;
-import java.util.Iterator;
-import java.util.LinkedHashMap;
-import java.util.LinkedHashSet;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.UUID;
-import java.util.concurrent.Callable;
-import java.util.concurrent.FutureTask;
-
-import org.apache.log4j.Logger;
-
-import com.bigdata.bop.BOp;
-import com.bigdata.bop.BOpContext;
-import com.bigdata.bop.BOpContextBase;
-import com.bigdata.bop.BOpEvaluationContext;
-import com.bigdata.bop.BOpIdFactory;
-import com.bigdata.bop.BOpUtility;
-import com.bigdata.bop.IBindingSet;
-import com.bigdata.bop.IConstraint;
-import com.bigdata.bop.IElement;
-import com.bigdata.bop.IPredicate;
-import com.bigdata.bop.IVariable;
-import com.bigdata.bop.NV;
-import com.bigdata.bop.PipelineOp;
-import com.bigdata.bop.ap.SampleIndex;
-import com.bigdata.bop.ap.SampleIndex.SampleType;
-import com.bigdata.bop.bindingSet.HashBindingSet;
-import com.bigdata.bop.engine.IRunningQuery;
-import com.bigdata.bop.engine.LocalChunkMessage;
-import com.bigdata.bop.engine.QueryEngine;
-import com.bigdata.bop.join.PipelineJoin;
-import com.bigdata.bop.join.PipelineJoin.PipelineJoinStats;
-import com.bigdata.bop.rdf.join.DataSetJoin;
-import com.bigdata.relation.IRelation;
-import com.bigdata.relation.accesspath.BufferClosedException;
-import com.bigdata.relation.accesspath.IAccessPath;
-import com.bigdata.relation.accesspath.IAsynchronousIterator;
-import com.bigdata.relation.accesspath.ThickAsynchronousIterator;
-import com.bigdata.striterator.Dechunkerator;
-import com.bigdata.striterator.IChunkedIterator;
-import com.bigdata.util.concurrent.Haltable;
-
-/**
- * A join graph with annotations for estimated cardinality and other details in
- * support of runtime query optimization. A join graph is a collection of
- * relations and joins which connect those relations. This boils down to a
- * collection of {@link IPredicate}s (selects on relations), shared variables
- * (which identify joins), and {@link IConstraint}s (which limit solutions).
- * Operators other than standard joins (including optional joins, sort, order
- * by, etc.) must be handled downstream from the join graph in a "tail plan".
- * 
- * @see http://arxiv.org/PS_cache/arxiv/pdf/0810/0810.4809v1.pdf, XQuery Join
- *      Graph Isolation.
- * 
- * @author <a href="mailto:tho...@us...">Bryan Thompson</a>
- * @version $Id$
- * 
- * @todo Examine the overhead of the runtime optimizer. Look at ways to prune
- *       its costs. For example, by pruning the search, by recognizing when the
- *       query is simple enough to execute directly, by recognizing when we have
- *       already materialized the answer to the query, etc.
- * 
- * @todo Cumulative estimated cardinality is an estimate of the work to be done.
- *       However, the actual cost of a join depends on whether we will use
- *       nested index subquery or a hash join and the cost of that operation on
- *       the database. There could be counter examples where the cost of the
- *       hash join with a range scan using the unbound variable is LT the nested
- *       index subquery. For those cases, we will do the same amount of IO on
- *       the hash join but there will still be a lower cardinality to the join
- *       path since we are feeding in fewer solutions to be joined.
- * 
- * @todo Look at the integration with the SAIL. We decorate the joins with some
- *       annotations. Those will have to be correctly propagated to the "edges"
- *       in order for edge sampling and incremental evaluation (or final
- *       evaluation) to work. The {@link DataSetJoin} essentially inlines one of
- *       its access paths. That should really be changed into an inline access
- *       path and a normal join operator so we can defer some of the details
- *       concerning the join operator annotations until we decide on the join
- *       path to be executed. An inline AP really implies an inline relation,
- *       which in turn implies that the query is a searchable context for
- *       query-local resources.
- *       <p>
- *       For s/o, when the AP is remote, the join evaluation context must be ANY
- *       and otherwise (for s/o) it must be SHARDED.
- *       <p>
- *       Since the join graph is fed the vertices (APs), it does not have access
- *       to the annotated joins so we need to generated appropriately annotated
- *       joins when sampling an edge and when evaluation a subquery.
- *       <p>
- *       One solution would be to always use the unpartitioned views of the
- *       indices for the runtime query optimizer, which is how we are estimating
- *       the range counts of the access paths right now. [Note that the static
- *       query optimizer ignores named and default graphs, while the runtime
- *       query optimizer SHOULD pay attention to these things and exploit their
- *       conditional selectivity for the query plan.]
- * 
- * @todo Handle optional join graphs by first applying the runtime optimizer to
- *       the main join graph and obtaining a sample for the selected join path.
- *       That sample will then be feed into the the optional join graph in order
- *       to optimize the join order within the optional join graph (a join order
- *       which is selective in the optional join graph is better since it will
- *       result in faster rejections of intermediate results and hence do less
- *       work).
- *       <p>
- *       This is very much related to accepting a collection of non-empty
- *       binding sets when running the join graph. However, optional join graph
- *       should be presented in combination with the original join graph and the
- *       starting paths must be constrained to have the selected join path for
- *       the original join graph as a prefix. With this setup, the original join
- *       graph has been locked in to a specific join path and the sampling of
- *       edges and vertices for the optional join graph can proceed normally.
- *       <p>
- *       True optionals will always be appended as part of the "tail plan" for
- *       any join graph and can not be optimized as each optional join must run
- *       regardless (as long as the intermediate solution survives the
- *       non-optional joins).
- * 
- * @todo There are two cases where a join graph must be optimized against a
- *       specific set of inputs. In one case, it is a sample (this is how
- *       optimization of an optional join group proceeds per above). In the
- *       other case, the set of inputs is fixed and is provided instead of a
- *       single empty binding set as the starting condition. This second case is
- *       actually a bit more complicated since we can not use a random sample of
- *       vertices unless the do not share any variables with the initial binding
- *       sets. When there is a shared variable, we need to do a cutoff join of
- *       the edge with the initial binding sets. When there is not a shared
- *       variable, we can sample the vertex and then do a cutoff join.
- * 
- * @todo When we run into a cardinality estimation underflow (the expected
- *       cardinality goes to zero) we could double the sample size for just
- *       those join paths which hit a zero estimated cardinality and re-run them
- *       within the round. This would imply that we keep per join path limits.
- *       The vertex and edge samples are already aware of the limit at which
- *       they were last sampled so this should not cause any problems there.
- *       <p>
- *       A related option would be to deepen the samples only when we are in
- *       danger of cardinality estimation underflow. E.g., a per-path limit.
- *       Resampling vertices may only make sense when we increase the limit
- *       since otherwise we may find a different correlation with the new sample
- *       but the comparison of paths using one sample base with paths using a
- *       different sample base in a different round does not carry forward the
- *       cardinality estimates from the prior round (unless we do something like
- *       a weighted moving average).
- * 
- * @todo When comparing choices among join paths having fully bound tails where
- *       the estimated cardinality has also gone to zero, we should prefer to
- *       evaluate vertices in the tail with better index locality first. For
- *       example, if one vertex had one variable in the original plan while
- *       another had two variables, then solutions which reach the 2-var vertex
- *       could be spread out over a much wider range of the selected index than
- *       those which reach the 1-var vertex. [In order to support this, we would
- *       need a means to indicate that a fully bound access path should use an
- *       index specified by the query optimizer rather than the primary index
- *       for the relation. In addition, this suggests that we should keep bloom
- *       filters for more than just the SPO(C) index in scale-out.]
- * 
- * @todo Examine behavior when we do not have perfect covering indices. This
- *       will mean that some vertices can not be sampled using an index and that
- *       estimation of their cardinality will have to await the estimation of
- *       the cardinality of the edge(s) leading to that vertex. Still, the
- *       approach should be able to handle queries without perfect / covering
- *       automatically. Then experiment with carrying fewer statement indices
- *       for quads.
- * 
- * @todo Unit test when there are no solutions to the query. In this case there
- *       will be no paths identified by the optimizer and the final path length
- *       becomes zero.
- */
-public class JoinGraph extends PipelineOp {
-
-	private static final transient Logger log = Logger
-			.getLogger(JoinGraph.class);
-
-	private static final long serialVersionUID = 1L;
-
-	/**
-	 * Known annotations.
-	 */
-	public interface Annotations extends PipelineOp.Annotations {
-
-        /**
-         * The vertices of the join graph, expressed an an {@link IPredicate}[]
-         * (required).
-         */
-        String VERTICES = JoinGraph.class.getName() + ".vertices";
-
-        /**
-         * The constraints on the join graph, expressed an an
-         * {@link IConstraint}[] (optional, defaults to no constraints).
-         */
-        String CONSTRAINTS = JoinGraph.class.getName() + ".constraints";
-
-        /**
-         * The initial limit for cutoff sampling (default
-         * {@value #DEFAULT_LIMIT}).
-         */
-        String LIMIT = JoinGraph.class.getName() + ".limit";
-
-		int DEFAULT_LIMIT = 100;
-
-		/**
-		 * The <i>nedges</i> edges of the join graph having the lowest
-		 * cardinality will be used to generate the initial join paths (default
-		 * {@value #DEFAULT_NEDGES}). This must be a positive integer.
-		 */
-		String NEDGES = JoinGraph.class.getName() + ".nedges";
-
-		int DEFAULT_NEDGES = 2;
-	}
-
-	/**
-	 * @see Annotations#VERTICES
-	 */
-	public IPredicate<?>[] getVertices() {
-
-		return (IPredicate[]) getRequiredProperty(Annotations.VERTICES);
-
-	}
-
-    /**
-     * @see Annotations#CONSTRAINTS
-     */
-    public IConstraint[] getConstraints() {
-
-        return (IConstraint[]) getProperty(Annotations.CONSTRAINTS, null/* none */);
-
-    }
-
-	/**
-	 * @see Annotations#LIMIT
-	 */
-	public int getLimit() {
-
-		return getProperty(Annotations.LIMIT, Annotations.DEFAULT_LIMIT);
-
-	}
-
-	/**
-	 * @see Annotations#NEDGES
-	 */
-	public int getNEdges() {
-
-		return getProperty(Annotations.NEDGES, Annotations.DEFAULT_NEDGES);
-
-	}
-
-    public JoinGraph(final BOp[] args, final NV... anns) {
-
-        this(args, NV.asMap(anns));
-
-	}
-
-    public JoinGraph(final BOp[] args, final Map<String, Object> anns) {
-
-        super(args, anns);
-
-        // required property.
-        final IPredicate<?>[] vertices = (IPredicate[]) getProperty(Annotations.VERTICES);
-
-        if (vertices == null)
-            throw new IllegalArgumentException(Annotations.VERTICES);
-
-        if (vertices.length == 0)
-            throw new IllegalArgumentException(Annotations.VERTICES);
-
-        if (getLimit() <= 0)
-            throw new IllegalArgumentException(Annotations.LIMIT);
-
-        if (getNEdges() <= 0)
-            throw new IllegalArgumentException(Annotations.NEDGES);
-
-        if (!isController())
-            throw new IllegalArgumentException();
-
-        switch (getEvaluationContext()) {
-        case CONTROLLER:
-            break;
-        default:
-            throw new IllegalArgumentException(Annotations.EVALUATION_CONTEXT
-                    + "=" + getEvaluationContext());
-        }
-
-	}
-
-	public FutureTask<Void> eval(final BOpContext<IBindingSet> context) {
-
-		return new FutureTask<Void>(new JoinGraphTask(context));
-
-	}
-
-	/**
-	 * A sample of a {@link Vertex} (an access path).
-	 */
-	public static class VertexSample {
-
-		/**
-		 * Fast range count. This will be the same for each sample taken
-		 * (assuming a read historical view or even a time scale of query which
-		 * is significantly faster than update).
-		 */
-		public final long rangeCount;
-
-		/**
-		 * The limit used to produce the {@link #sample}.
-		 */
-		public final int limit;
-
-		/**
-		 * When <code>true</code>, the result is not a sample but the
-		 * materialized access path.
-		 * 
-		 * TODO When <code>true</code>, we could run the join against the sample
-		 * rather than the disk. This would require wrapping the sample as an
-		 * access path. Since all exact samples will be pretty small, this is
-		 * not likely to have any great performance benefit.
-		 */
-		public final boolean exact;
-
-		/**
-		 * Sample.
-		 */
-		final Object[] sample;
-
-		/**
-		 * 
-		 * @param rangeCount
-		 * @param limit
-		 * @param exact
-		 * @param sample
-		 */
-		public VertexSample(final long rangeCount, final int limit,
-				final boolean exact, final Object[] sample) {
-
-			if (rangeCount < 0L)
-				throw new IllegalArgumentException();
-
-			if (limit <= 0)
-				throw new IllegalArgumentException();
-
-			if (sample == null)
-				throw new IllegalArgumentException();
-
-			this.rangeCount = rangeCount;
-
-			this.limit = limit;
-
-			this.exact = exact;
-
-			this.sample = sample;
-
-		}
-
-		public String toString() {
-			return "VertexSample{rangeCount=" + rangeCount + ",limit=" + limit
-					+ ",exact=" + exact + ", sampleSize=" + sample.length + "}";
-		}
-
-	}
-
-	/**
-	 * A vertex of the join graph is an annotated relation (this corresponds to
-	 * an {@link IPredicate} with additional annotations to support the adaptive
-	 * query optimization algorithm).
-	 * <p>
-	 * The unique identifier for a {@link Vertex} (within a given join graph) is
-	 * the {@link BOp.Annotations#BOP_ID} decorating its {@link IPredicate}.
-	 * {@link #hashCode()} is defined in terms of this unique identifier so we
-	 * can readily detect when a {@link Set} already contains a given
-	 * {@link Vertex}.
-	 */
-	public static class Vertex implements Serializable {
-
-		/**
-         * 
-         */
-		private static final long serialVersionUID = 1L;
-
-		public final IPredicate<?> pred;
-
-		/**
-		 * The most recently taken sample of the {@link Vertex}.
-		 */
-		transient VertexSample sample = null;
-
-		Vertex(final IPredicate<?> pred) {
-
-			if (pred == null)
-				throw new IllegalArgumentException();
-
-			this.pred = pred;
-
-		}
-
-		public String toString() {
-
-			return "Vertex{pred=" + pred + ",sample=" + sample + "}";
-
-		}
-
-		/**
-		 * Equals is based on a reference test.
-		 */
-		public boolean equals(Object o) {
-			return this == o;
-		}
-
-		/**
-		 * The hash code is just the {@link BOp.Annotations#BOP_ID} of the
-		 * associated {@link IPredicate}.
-		 */
-		public int hashCode() {
-			return pred.getId();
-		}
-
-		/**
-		 * Take a sample of the vertex, updating {@link #sample} as a
-		 * side-effect. If the sample is already exact, then this is a NOP. If
-		 * the vertex was already sampled to that limit, then this is a NOP (you
-		 * have to raise the limit to re-sample the vertex).
-		 * 
-		 * @param limit
-		 *            The sample cutoff.
-		 */
-		public void sample(final QueryEngine queryEngine, final int limit) {
-
-			if (queryEngine == null)
-				throw new IllegalArgumentException();
-
-			if (limit <= 0)
-				throw new IllegalArgumentException();
-
-			final VertexSample oldSample = this.sample;
-
-			if (oldSample != null && oldSample.exact) {
-
-				/*
-				 * The old sample is already the full materialization of the
-				 * vertex.
-				 */
-
-				return;
-
-			}
-
-			if (oldSample != null && oldSample.limit >= limit) {
-
-				/*
-				 * The vertex was already sampled to this limit. 
-				 */
-				
-				return;
-				
-			}
-			
-			final BOpContextBase context = new BOpContextBase(queryEngine);
-			
-			final IRelation r = context.getRelation(pred);
-
-			final IAccessPath ap = context.getAccessPath(r, pred);
-
-			final long rangeCount = oldSample == null ? ap
-					.rangeCount(false/* exact */) : oldSample.rangeCount;
-
-			if (rangeCount <= limit) {
-
-				/*
-				 * Materialize the access path.
-				 * 
-				 * TODO This could be more efficient if we raised it onto the AP
-				 * or if we overrode CHUNK_CAPACITY and the fully buffered
-				 * iterator threshold such that everything was materialized as a
-				 * single chunk.
-				 */
-
-				final List<Object> tmp = new ArrayList<Object>((int) rangeCount);
-
-				final IChunkedIterator<Object> itr = ap.iterator();
-
-				try {
-
-					while (itr.hasNext()) {
-
-						tmp.add(itr.next());
-
-					}
-
-				} finally {
-
-					itr.close();
-				}
-
-				sample = new VertexSample(rangeCount, limit, true/* exact */,
-						tmp.toArray(new Object[0]));
-
-			} else {
-
-				/*
-				 * Materialize a random sample from the access path.
-				 */
-
-//				final SampleType sampleType = SampleType.EVEN;
-				final SampleType sampleType = SampleType.RANDOM;
-				
-				final SampleIndex<?> sampleOp = new SampleIndex(
-						new BOp[] {}, //
-						NV.asMap(//
-							new NV(SampleIndex.Annotations.PREDICATE, pred),//
-							new NV(SampleIndex.Annotations.LIMIT, limit),//
-							new NV(SampleIndex.Annotations.SAMPLE_TYPE, sampleType.name())//
-							));
-
-				sample = new VertexSample(rangeCount, limit, false/* exact */,
-						sampleOp.eval(context));
-
-			}
-
-			if (log.isTraceEnabled())
-				log.trace("Sampled: " + sample);
-
-			return;
-			
-		}
-
-	}
-
-	/**
-	 * Type safe enumeration describes the edge condition (if any) for a
-	 * cardinality estimate.
-	 */
-	public static enum EstimateEnum {
-		/**
-		 * An estimate, but not any of the edge conditions.
-		 */
-		Normal(" "),
-		/**
-		 * The cardinality estimate is exact.
-		 */
-		Exact("E"),
-		/**
-		 * The cardinality estimation is a lower bound (the actual cardinality
-		 * may be higher than the estimated value).
-		 */
-		LowerBound("L"),
-		/**
-		 * Flag is set when the cardinality estimate underflowed (false zero
-		 * (0)).
-		 */
-		Underflow("U");
-		
-		private EstimateEnum(final String code) {
-			
-			this.code = code;
-			
-		}
-
-		private final String code;
-
-		public String getCode() {
-
-			return code;
-			
-		}
-		
-	} // EstimateEnum
-	
-	/**
-	 * A sample of an {@link Edge} (a join).
-	 */
-	public static class EdgeSample {
-
-		/**
-		 * The fast range count (aka cardinality) for the source vertex of the
-		 * edge (whichever vertex has the lower cardinality).
-		 */
-		public final long rangeCount;
-
-		/**
-		 * <code>true</code> iff the source sample is exact (because the source
-		 * is either a fully materialized vertex or an edge whose solutions have
-		 * been fully materialized).
-		 */
-		public final boolean sourceSampleExact;
-		
-		/**
-		 * The limit used to sample the edge (this is the limit on the #of
-		 * solutions generated by the cutoff join used when this sample was
-		 * taken).
-		 */
-		public final int limit;
-
-		/**
-		 * The #of binding sets out of the source sample vertex sample which
-		 * were consumed.
-		 */
-		public final int inputCount;
-
-		/**
-		 * The #of binding sets generated before the join was cutoff.
-		 * <p>
-		 * Note: If the outputCount is zero then this is a good indicator that
-		 * there is an error in the query such that the join will not select
-		 * anything. This is not 100%, merely indicative.
-		 */
-		public final long outputCount;
-
-		/**
-		 * The ratio of the #of input samples consumed to the #of output samples
-		 * generated (the join hit ratio or scale factor).
-		 */
-		public final double f;
-
-		/**
-		 * The estimated cardinality of the join.
-		 */
-		public final long estimatedCardinality;
-
-		/**
-		 * Indicates whether the estimate is exact, an upper bound, or a lower
-		 * bound.
-		 * 
-		 * TODO This field should be used to avoid needless re-computation of a
-		 * join whose exact solution is already known.
-		 */
-		public final EstimateEnum estimateEnum;
-		
-		/**
-		 * The sample of the solutions for the join path.
-		 */
-		private final IBindingSet[] sample;
-
-		/**
-		 * Create an object which encapsulates a sample of an edge.
-		 * 
-		 * @param limit
-		 *            The limit used to sample the edge (this is the limit on
-		 *            the #of solutions generated by the cutoff join used when
-		 *            this sample was taken).
-		 * @param sourceVertexSample
-		 *            The sample for source vertex of the edge (whichever vertex
-		 *            has the lower cardinality).
-		 * @param inputCount
-		 *            The #of binding sets out of the source sample vertex
-		 *            sample which were consumed.
-		 * @param outputCount
-		 *            The #of binding sets generated before the join was cutoff.
-		 */
-		EdgeSample(
-				// final VertexSample sourceVertexSample,
-				final long sourceSampleRangeCount,//
-				final boolean sourceSampleExact, //
-				final int sourceSampleLimit,//
-				final int limit,//
-				final int inputCount, //
-				final long outputCount,//
-				final double f,
-				final long estimatedCardinality,
-				final IBindingSet[] sample) {
-
-			if (sample == null)
-				throw new IllegalArgumentException();
-
-			// this.rangeCount = sourceVertexSample.rangeCount;
-			this.rangeCount = sourceSampleRangeCount;
-			
-			this.sourceSampleExact = sourceSampleExact;
-
-			this.limit = limit;
-
-			this.inputCount = inputCount;
-
-			this.outputCount = outputCount;
-
-			this.f = f;
-			
-			this.estimatedCardinality = estimatedCardinality;
-			
-			if (sourceSampleExact && outputCount < limit) {
-				/*
-				 * Note: If the entire source vertex is being fed into the
-				 * cutoff join and the cutoff join outputCount is LT the limit,
-				 * then the sample is the actual result of the join. That is,
-				 * feeding all source solutions into the join gives fewer than
-				 * the desired number of output solutions.
-				 */
-				estimateEnum = EstimateEnum.Exact;
-			} else if (inputCount == 1 && outputCount == limit) {
-				/*
-				 * If the inputCount is ONE (1) and the outputCount is the
-				 * limit, then the estimated cardinality is a lower bound as
-				 * more than outputCount solutions might be produced by the join
-				 * when presented with a single input solution.
-				 */
-				estimateEnum = EstimateEnum.LowerBound;
-			} else if (!sourceSampleExact
-					&& inputCount == Math.min(sourceSampleLimit, rangeCount)
-					&& outputCount == 0) {
-				/*
-				 * When the source sample was not exact, the inputCount is EQ to
-				 * the lesser of the source range count and the source sample
-				 * limit, and the outputCount is ZERO (0), then feeding in all
-				 * source solutions in is not sufficient to generate any output
-				 * solutions. In this case, the estimated join hit ratio appears
-				 * to be zero. However, the estimation of the join hit ratio
-				 * actually underflowed and the real join hit ratio might be a
-				 * small non-negative value. A real zero can only be identified
-				 * by executing the full join.
-				 * 
-				 * Note: An apparent join hit ratio of zero does NOT imply that
-				 * the join will be empty (unless the source vertex sample is
-				 * actually the fully materialized access path - this case is
-				 * covered above).
-				 */
-				estimateEnum = EstimateEnum.Underflow;
-			} else {
-				estimateEnum = EstimateEnum.Normal;
-			}
-
-			this.sample = sample;
-		}
-
-		public String toString() {
-			return getClass().getName() //
-					+ "{ rangeCount=" + rangeCount//
-					+ ", sourceSampleExact=" + sourceSampleExact//
-					+ ", limit=" + limit //
-					+ ", inputCount=" + inputCount//
-					+ ", outputCount=" + outputCount //
-					+ ", f=" + f//
-					+ ", estimatedCardinality=" + estimatedCardinality//
-					+ ", estimateEnum=" + estimateEnum//
-//					+ ", estimateIsLowerBound=" + estimateIsLowerBound//
-//					+ ", estimateIsUpperBound=" + estimateIsUpperBound//
-//					+ ", sampleIsExactSolution=" + estimateIsExact //
-					+ "}";
-		}
-
-	};
-
-	/**
-	 * An edge of the join graph is an annotated join operator. The edges of the
-	 * join graph are undirected. Edges exist when the vertices share at least
-	 * one variable.
-	 * <p>
-	 * {@link #hashCode()} is defined in terms of the unordered hash codes of
-	 * the individual vertices.
-	 */
-	public static class Edge implements Serializable {
-
-		/**
-         * 
-         */
-		private static final long serialVersionUID = 1L;
-
-		/**
-		 * The vertices connected by that edge.
-		 */
-		public final Vertex v1, v2;
-
-		/**
-		 * The set of shared variables.
-		 */
-		public final Set<IVariable<?>> shared;
-
-		/**
-		 * The last sample for this edge and <code>null</code> if the edge has
-		 * not been sampled.
-		 * <p>
-		 * Note: This sample is only the one-step cutoff evaluation of the edge
-		 * given a sample of its vertex having the lesser cardinality. It is NOT
-		 * the cutoff sample of a join path having this edge except for the
-		 * degenerate case where the edge is the first edge in the join path.
-		 */
-		transient EdgeSample sample = null;
-
-		public Edge(final Vertex v1, final Vertex v2,
-				final Set<IVariable<?>> shared) {
-			if (v1 == null)
-				throw new IllegalArgumentException();
-			if (v2 == null)
-				throw new IllegalArgumentException();
-			if (shared == null)
-				throw new IllegalArgumentException();
-			// Note: We need to allow edges which do not share variables
-//			if (shared.isEmpty())
-//				throw new IllegalArgumentException();
-			this.v1 = v1;
-			this.v2 = v2;
-			this.shared = shared;
-		}
-
-		/**
-		 * The edge label is formed from the {@link BOp.Annotations#BOP_ID} of
-		 * its ordered vertices (v1,v2).
-		 */
-		public String getLabel() {
-
-			return "(" + v1.pred.getId() + "," + v2.pred.getId() + ")";
-			
-		}
-		
-		/**
-		 * Note: The vertices of the edge are labeled using the
-		 * {@link BOp.Annotations#BOP_ID} associated with the {@link IPredicate}
-		 * for each vertex.
-		 */
-		public String toString() {
-
-			return "Edge{ "+getLabel()+", estCard="
-					+ (sample == null ? "N/A" : sample.estimatedCardinality)
-					+ ", shared=" + shared.toString() + ", sample=" + sample
-					+ "}";
-
-		}
-
-		/**
-		 * Equality is determined by reference testing.
-		 */
-		public boolean equals(final Object o) {
-
-			return this == o;
-
-		}
-
-		/**
-		 * The hash code of an edge is the hash code of the vertex with the
-		 * smaller hash code X 31 plus the hash code of the vertex with the
-		 * larger hash code. This definition compensates for the arbitrary order
-		 * in which the vertices may be expressed and also recognizes that the
-		 * vertex hash codes are based on the bop ids, which are often small
-		 * integers.
-		 */
-		public int hashCode() {
-
-			if (hash == 0) {
-
-				final int h1 = v1.hashCode();
-				final int h2 = v2.hashCode();
-
-				final int h;
-				if (h1 < h2) {
-
-					h = h1 * 31 + h2;
-
-				} else {
-
-					h = h2 * 31 + h1;
-
-				}
-
-				hash = h;
-
-			}
-			return hash;
-
-		}
-
-		private int hash;
-
-		/**
-		 * Return the vertex with the smaller estimated cardinality.
-		 * 
-		 * @throws IllegalStateException
-		 *             if either vertex has not been sampled.
-		 */
-		public Vertex getMinimumCardinalityVertex() {
-
-			if (v1.sample == null) // vertex not sampled.
-				throw new IllegalStateException();
-
-			if (v2.sample == null) // vertex not sampled.
-				throw new IllegalStateException();
-
-			return (v1.sample.rangeCount < v2.sample.rangeCount) ? v1 : v2;
-
-		}
-
-		/**
-		 * Return the vertex with the larger estimated cardinality (the vertex
-		 * not returned by {@link #getMinimumCardinalityVertex()}).
-		 * 
-		 * @throws IllegalStateException
-		 *             if either vertex has not been sampled.
-		 */
-		public Vertex getMaximumCardinalityVertex() {
-
-			// The vertex with the minimum cardinality.
-			final Vertex o = getMinimumCardinalityVertex();
-
-			// Return the other vertex.
-			return (v1 == o) ? v2 : v1;
-
-		}
-
-		/**
-		 * Estimate the cardinality of the edge, updating {@link #sample} as a
-		 * side-effect. This is a NOP if the edge has already been sampled at
-		 * that <i>limit</i>. This is a NOP if the edge sample is exact.
-		 * 
-		 * @param context
-		 * 
-		 * @return The new {@link EdgeSample} (this is also updated on
-		 *         {@link #sample} as a side-effect).
-		 * 
-		 * @throws Exception
-		 */
-		public EdgeSample estimateCardinality(final QueryEngine queryEngine,
-				final int limit) throws Exception {
-
-			if (limit <= 0)
-				throw new IllegalArgumentException();
-
-//			/*
-//			 * Note: There is never a need to "re-sample" the edge. Unlike ROX,
-//			 * we always can sample a vertex. This means that we can sample the
-//			 * edges exactly once, during the initialization of the join graph.
-//			 */
-//			if (sample != null)
-//				throw new RuntimeException();
-
-			if (sample != null) {
-
-				if (sample.limit >= limit) {
-
-					// Already sampled at that limit.
-					return sample;
-
-				}
-
-				if (sample.estimateEnum == EstimateEnum.Exact) {
-
-					// Sample is exact (fully materialized result).
-					return sample;
-
-				}
-				
-			}
-			
-			/*
-			 * Figure out which vertex has the smaller cardinality. The sample
-			 * of that vertex is used since it is more representative than the
-			 * sample of the other vertex.
-			 */
-			// vertex v, vprime
-			final Vertex v, vp;
-			if (v1.sample == null) // vertex not sampled.
-				throw new IllegalStateException();
-			if (v2.sample == null) // vertex not sampled.
-				throw new IllegalStateException();
-            /*
-             * FIXME CONSTRAINT ORDERING : If a variable only appears in a
-             * CONSTRAINT for one of the two vertices then that vertex must be
-             * evaluated second. (If the vertices both have this problem then
-             * the edge can not be evaluated until some other vertex causes the
-             * variables of either one [v1] or [v2] to become bound.)
-             */
-			if (v1.sample.rangeCount < v2.sample.rangeCount) {
-				v = v1;
-				vp = v2;
-			} else {
-				v = v2;
-				vp = v1;
-			}
-
-			/*
-			 * Convert the source sample into an IBindingSet[].
-			 * 
-			 * TODO We might as well do this when we sample the vertex.
-			 */
-			final IBindingSet[] sourceSample = new IBindingSet[v.sample.sample.length];
-			{
-				for (int i = 0; i < sourceSample.length; i++) {
-					final IBindingSet bset = new HashBindingSet();
-					BOpContext.copyValues((IElement) v.sample.sample[i],
-							v.pred, bset);
-					sourceSample[i] = bset;
-				}
-			}
-
-			// Sample the edge and save the sample on the edge as a side-effect.
-			this.sample = estimateCardinality(queryEngine, limit, v, vp,
-					v.sample.rangeCount, v.sample.exact, v.sample.limit,
-					sourceSample);
-
-			return sample;
-
-		}
-
-		/**
-		 * Estimate the cardinality of the edge given a sample of either a
-		 * vertex or a join path leading up to that edge.
-		 * <p>
-		 * Note: The caller is responsible for protecting against needless
-		 * re-sampling.
-		 * 
-		 * @param queryEngine
-		 * @param limit
-		 * @param vSource
-		 *            The source vertex.
-		 * @param vTarget
-		 *            The target vertex
-		 * @param sourceSample
-		 *            The sample for the source vertex. When this is a one-step
-		 *            estimation of the cardinality of the edge, then this
-		 *            sample is taken from the {@link VertexSample}. When the
-		 *            edge (vSource,vTarget) extends some {@link Path}, then
-		 *            this is taken from the {@link EdgeSample} for that
-		 *            {@link Path}.
-		 * 
-		 * @return The result of sampling that edge.
-		 * 
-		 * @throws Exception
-		 */
-		public EdgeSample estimateCardinality(final QueryEngine queryEngine,
-				final int limit, final Vertex vSource, final Vertex vTarget,
-				final long sourceSampleRangeCount,
-				final boolean sourceSampleExact,
-				final int sourceSampleLimit,
-				final IBindingSet[] sourceSample)
-				throws Exception {
-
-			if (limit <= 0)
-				throw new IllegalArgumentException();
-
-			/*
-			 * Note: This sets up a cutoff pipeline join operator which makes an
-			 * accurate estimate of the #of input solutions consumed and the #of
-			 * output solutions generated. From that, we can directly compute
-			 * the join hit ratio. This approach is preferred to injecting a
-			 * "RowId" column as the estimates are taken based on internal
-			 * counters in the join operator and the join operator knows how to
-			 * cutoff evaluation as soon as the limit is satisfied, thus
-			 * avoiding unnecessary effort.
-			 * 
-			 * TODO Any constraints on the edge (other than those implied by
-			 * shared variables) need to be annotated on the join. Constraints
-			 * (other than range constraints which are directly coded by the
-			 * predicate) will not reduce the effort to compute the join, but
-			 * they can reduce the cardinality of the join and that is what we
-			 * are trying to estimate here.
-			 * 
-			 * TODO How can join constraints be moved around? Just attach them
-			 * where ever a variable becomes bound? And when do we filter out
-			 * variables which are not required downstream? Once we decide on a
-			 * join path and execute it fully (rather than sampling that join
-			 * path).
-			 */
-			final int joinId = 1;
-			final Map<String,Object> anns = NV.asMap(//
-				new NV(BOp.Annotations.BOP_ID, joinId),//
-				// @todo Why not use a factory which avoids bopIds already in use?
-				new NV(PipelineJoin.Annotations.PREDICATE, vTarget.pred.setBOpId(3)),
-				// disallow parallel evaluation of tasks.
-				new NV(PipelineOp.Annotations.MAX_PARALLEL,1),
-				// disallow parallel evaluation of chunks.
-				new NV(PipelineJoin.Annotations.MAX_PARALLEL_CHUNKS,0),
-				// disable access path coalescing 
-				new NV(PipelineJoin.Annotations.COALESCE_DUPLICATE_ACCESS_PATHS,false),
-				// cutoff join.
-				new NV(PipelineJoin.Annotations.LIMIT,(long)limit),
-				/*
-				 * Note: In order to have an accurate estimate of the join
-				 * hit ratio we need to make sure that the join operator
-				 * runs using a single PipelineJoinStats instance which will
-				 * be visible to us when the query is cutoff. In turn, this
-				 * implies that the join must be evaluated on the query
-				 * controller.
-				 * 
-				 * @todo This implies that sampling of scale-out joins must
-				 * be done using remote access paths.
-				 */
-				new NV(PipelineJoin.Annotations.SHARED_STATE,true),
-				new NV(PipelineJoin.Annotations.EVALUATION_CONTEXT,BOpEvaluationContext.CONTROLLER)
-			);
-
-			final PipelineJoin joinOp = new PipelineJoin(new BOp[] {}, anns);
-
-			final PipelineOp queryOp = joinOp;
-			
-			// run the cutoff sampling of the edge.
-			final UUID queryId = UUID.randomUUID();
-			final IRunningQuery runningQuery = queryEngine.eval(queryId,
-					queryOp, new LocalChunkMessage<IBindingSet>(queryEngine,
-							queryId, joinOp.getId()/* startId */,
-							-1 /* partitionId */,
-							new ThickAsynchronousIterator<IBindingSet[]>(
-									new IBindingSet[][] { sourceSample })));
-
-			final List<IBindingSet> result = new LinkedList<IBindingSet>();
-			try {
-				try {
-					IBindingSet bset = null;
-					// Figure out the #of source samples consumed.
-					final Iterator<IBindingSet> itr = new Dechunkerator<IBindingSet>(
-							runningQuery.iterator());
-					while (itr.hasNext()) {
-						bset = itr.next();
-						result.add(bset);
-					}
-				} finally {
-					// verify no problems.
-					runningQuery.get();
-				}
-			} finally {
-				runningQuery.cancel(true/* mayInterruptIfRunning */);
-			}
-
-			// The join hit ratio can be computed directly from these stats.
-			final PipelineJoinStats joinStats = (PipelineJoinStats) runningQuery
-					.getStats().get(joinId);
-
-			if (log.isTraceEnabled())
-				log.trace(joinStats.toString());
-
-			/*
-			 * TODO Improve comments here. See if it is possible to isolate a
-			 * common base class which would simplify the setup of the cutoff
-			 * join and the computation of the sample stats.
-			 */
-
-			// #of solutions in.
-			final int nin = (int) joinStats.inputSolutions.get();
-
-			// #of solutions out.
-			long nout = joinStats.outputSolutions.get();
-
-			// cumulative range count of the sampled access paths.
-			final long sumRangeCount = joinStats.accessPathRangeCount.get();
-
-			if (nin == 1 && nout == limit) {
-				/*
-				 * We are getting [limit] solutions out for one solution in. In
-				 * this case, (nout/nin) is a lower bound for the estimated
-				 * cardinality of the edge. In fact, this condition suggests
-				 * that the upper bound is a must better estimate of the
-				 * cardinality of this join. Therefore, we replace [nout] with
-				 * the sum of the range counts for the as-bound predicates
-				 * considered by the cutoff join.
-				 * 
-				 * For example, consider a join feeding a rangeCount of 16 into
-				 * a rangeCount of 175000. With a limit of 100, we estimated the
-				 * cardinality at 1600L (lower bound). In fact, the cardinality
-				 * is 16*175000. This falsely low estimate can cause solutions
-				 * which are really better to be dropped.
-				 * 
-				 * @todo we should mark [nout] when we do this show that it
-				 * shows up in the trace!  Also, the rangeCount is sometimes
-				 * falsely high. However, that should be corrected by random
-				 * resampling of the vertices and paths.
-				 */
-				nout = sumRangeCount;
-
-			}
-
-			final double f = nout == 0 ? 0 : (nout / (double) nin);
-
-			final long estimatedCardinality = (long) (sourceSampleRangeCount * f);
-			
-			final EdgeSample edgeSample = new EdgeSample(
-					sourceSampleRangeCount, //
-					sourceSampleExact, // @todo redundant with sourceSampleLimit
-					sourceSampleLimit, //
-					limit, //
-					nin,//
-					nout, //
-					f, //
-					estimatedCardinality, //
-					result.toArray(new IBindingSet[result.size()]));
-
-			if (log.isDebugEnabled())
-				log.debug(getLabel() + " : newSample=" + edgeSample);
-
-			return edgeSample;
-
-		}
-
-	}
-
-	/**
-	 * A sequence of {@link Edge}s (aka join steps).
-	 */
-	public static class Path {
-
-		/**
-		 * An immutable ordered list of the edges in the (aka the sequence of
-		 * joins represented by this path).
-		 */
-		public final List<Edge> edges;
-
-		/**
-		 * The sample obtained by the step-wise cutoff evaluation of the ordered
-		 * edges of the path.
-		 * <p>
-		 * Note: This sample is generated one edge at a time rather than by
-		 * attempting the cutoff evaluation of the entire join path (the latter
-		 * approach does allow us to limit the amount of work to be done to
-		 * satisfy the cutoff).
-		 */
-		public EdgeSample sample;
-
-		/**
-		 * The cumulative estimated cardinality of the path. This is zero for an
-		 * empty path. For a path consisting of a single edge, this is the
-		 * estimated cardinality of that edge. When creating a new path by
-		 * adding an edge to an existing path, the cumulative cardinality of the
-		 * new path is the cumulative cardinality of the existing path plus the
-		 * estimated cardinality of the cutoff join of the new edge given the
-		 * input sample of the existing path.
-		 * 
-		 * @todo track this per vertex as well as the total for more interesting
-		 * traces in showPath(Path).
-		 */
-		final public long cumulativeEstimatedCardinality;
-
-		public String toString() {
-			final StringBuilder sb = new StringBuilder();
-			sb.append("Path{");
-			boolean first = true;
-			for (Edge e : edges) {
-				if (!first)
-					sb.append(",");
-				sb.append(e.getLabel());
-				first = false;
-			}
-			sb.append(",cumEstCard=" + cumulativeEstimatedCardinality
-					+ ",sample=" + sample + "}");
-			return sb.toString();
-		}
-
-		/**
-		 * Create an empty path.
-		 */
-		public Path() {
-			this.edges = Collections.emptyList();
-			this.cumulativeEstimatedCardinality = 0;
-			this.sample = null;
-		}
-
-		/**
-		 * Create a path from a single edge.
-		 * 
-		 * @param e
-		 *            The edge.
-		 */
-		public Path(final Edge e) {
-
-			if (e == null)
-				throw new IllegalArgumentException();
-
-			if (e.sample == null)
-				throw new IllegalArgumentException("Not sampled: " + e);
-
-			this.edges = Collections.singletonList(e);
-
-			this.sample = e.sample;
-
-			this.cumulativeEstimatedCardinality = e.sample.estimatedCardinality;
-
-		}
-
-		/**
-		 * Constructor used by {@link #addEdge(QueryEngine, int, Edge)}
-		 * 
-		 * @param edges
-		 *            The edges in the new path.
-		 * @param cumulativeEstimatedCardinality
-		 *            The cumulative estimated cardinality of the new path.
-		 * @param sample
-		 *            The sample from the last
-		 */
-		private Path(final List<Edge> edges,
-				final long cumulativeEstimatedCardinality,
-				final EdgeSample sample) {
-
-			if (edges == null)
-				throw new IllegalArgumentException();
-
-			if (cumulativeEstimatedCardinality < 0)
-				throw new IllegalArgumentException();
-
-			if (sample == null)
-				throw new IllegalArgumentException();
-
-			this.edges = Collections.unmodifiableList(edges);
-
-			this.cumulativeEstimatedCardinality = cumulativeEstimatedCardinality;
-
-			this.sample = sample;
-
-		}
-
-		/**
-		 * Return <code>true</code> iff the {@link Path} contains at least one
-		 * {@link Edge} for that {@link Vertex}.
-		 * 
-		 * @param v
-		 *            The vertex
-		 * 
-		 * @return true if the vertex is already part of the path.
-		 */
-		public boolean contains(final Vertex v) {
-
-			if (v == null)
-				throw new IllegalArgumentException();
-
-			for (Edge e : edges) {
-
-				if (e.v1 == v || e.v2 == v)
-					return true;
-
-			}
-
-			return false;
-		}
-
-		/**
-		 * Return <code>true</code> if this path is an unordered variant of the
-		 * given path (same vertices in any order).
-		 * 
-		 * @param p
-		 *            Another path.
-		 * 
-		 * @return <code>true</code> if this path is an unordered variant of the
-		 *         given path.
-		 */
-		public boolean isUnorderedVariant(final Path p) {
-
-			if (p == null)
-				throw new IllegalArgumentException();
-
-			if (edges.size() != p.edges.size()) {
-				/*
-				 * Fast rejection. This assumes that each edge after the first
-				 * adds one distinct vertex to the path. That assumption is
-				 * enforced by #addEdge().
-				 */
-				return false;
-			}
-
-			final Vertex[] v1 = getVertices();
-			final Vertex[] v2 = p.getVertices();
-
-			if (v1.length != v2.length) {
-
-				// Reject (this case is also covered by the test above).
-				return false;
-				
-			}
-
-			/*
-			 * Scan the vertices of the caller's path. If any of those vertices
-			 * are NOT found in this path the paths are not unordered variations
-			 * of one another.
-			 */
-			for (int i = 0; i < v2.length; i++) {
-
-				final Vertex tmp = v2[i];
-
-				boolean found = false;
-				for (int j = 0; j < v1.length; j++) {
-
-					if (v1[j] == tmp) {
-						found = true;
-						break;
-					}
-
-				}
-
-				if (!found) {
-					return false;
-				}
-
-			}
-
-			return true;
-
-		}
-
-		/**
-		 * Return the vertices in this path (in path order). For the first edge,
-		 * the minimum cardinality vertex is always reported first (this is
-		 * critical for producing the correct join plan). For the remaining
-		 * edges in the path, the unvisited is reported.
-		 * 
-		 * @return The vertices (in path order).
-		 * 
-		 *         TODO This could be rewritten without the toArray() using a
-		 *         method which visits the vertices of a path in any order.
-		 *         
-		 *         @todo unit test for the first vertex to be reported.
-		 */
-		public Vertex[] getVertices() {
-			
-			final Set<Vertex> tmp = new LinkedHashSet<Vertex>();
-
-			for (Edge e : edges) {
-
-				if (tmp.isEmpty()) {
-                    /*
-                     * The first edge is handled specially in order to report
-                     * the minimum cardinality vertex first.
-                     * 
-                     * FIXME CONSTRAINT ORDERING : A vertex can not run until
-                     * all variables appearing in its CONSTRAINTS would be
-                     * bound. This can cause us to use and report an ordering
-                     * which does not place the minimum cardinality vertex 1st.
-                     */
-					tmp.add(e.getMinimumCardinalityVertex());
-					tmp.add(e.getMaximumCardinalityVertex());
-
-				} else {
-
-					tmp.add(e.v1);
-
-					tmp.add(e.v2);
-
-				}
-				
-			}
-			
-			final Vertex[] a = tmp.toArray(new Vertex[tmp.size()]);
-			
-			return a;
-			
-		}
-
-		/**
-		 * Return the {@link IPredicate}s associated with the vertices of the
-		 * join path in path order.
-		 * 
-		 * @see #getVertices()
-		 */
-		public IPredicate<?>[] getPredicates() {
-
-			// The vertices in the selected evaluation order.
-			final Vertex[] vertices = getVertices();
-
-			// The predicates in the same order as the vertices.
-			final IPredicate<?>[] preds = new IPredicate[vertices.length];
-
-			for (int i = 0; i < vertices.length; i++) {
-
-				preds[i] = vertices[i].pred;
-
-			}
-
-			return preds;
-
-		}
-
-		/**
-         * Return the {@link BOp} identifiers of the predicates associated with
-         * each vertex in path order.
-		 */
-		public int[] getVertexIds() {
-		    
-		    return getVertexIds(edges);
-		    
-		}
-		
-		/**
-		 * Return the {@link BOp} identifiers of the predicates associated with
-		 * each vertex in path order.
-		 */
-		static public int[] getVertexIds(final List<Edge> edges) {
-
-			final Set<Vertex> tmp = new LinkedHashSet<Vertex>();
-			
-			for (Edge e : edges) {
-			
-				tmp.add(e.v1);
-				
-				tmp.add(e.v2);
-				
-			}
-			
-			final Vertex[] a = tmp.toArray(new Vertex[tmp.size()]);
-
-			final int[] b = new int[a.length];
-			
-			for (int i = 0; i < a.length; i++) {
-			
-				b[i] = a[i].pred.getId();
-				
-			}
-			
-			return b;
-			
-		}
-
-		/**
-		 * Return <code>true</code> if this path begins with the given path.
-		 * 
-		 * @param p
-		 *            The given path.
-		 * 
-		 * @return <code>true</code> if this path begins with the given path.
-		 */
-		public boolean beginsWith(final Path p) {
-
-			if (p == null)
-				throw new IllegalArgumentException();
-
-			if (p.edges.size() > edges.size()) {
-				// Proven false since the caller's path is lon...
 
[truncated message content]

[Bigdata-commit] SF.net SVN: bigdata:[4218] branches/QUADS_QUERY_BRANCH/bigdata

Fast, scalable, robust graph database platform

[Bigdata-commit] SF.net SVN: bigdata:[4218] branches/QUADS_QUERY_BRANCH/bigdata