|
From: <tho...@us...> - 2010-10-02 01:00:56
|
Revision: 3718
http://bigdata.svn.sourceforge.net/bigdata/?rev=3718&view=rev
Author: thompsonbry
Date: 2010-10-02 01:00:49 +0000 (Sat, 02 Oct 2010)
Log Message:
-----------
Worked through more of the default graph access path decision tree. Only the parallel subquery case is left. I've also simplified the decision tree in the worksheet.
Modified Paths:
--------------
branches/QUADS_QUERY_BRANCH/bigdata/src/architecture/query-cost-model.xls
branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/BOpUtility.java
branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/ap/filter/DistinctFilter.java
branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/engine/Rule2BOpUtility.java
branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/bop/rdf/filter/StripContextFilter.java
branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/rdf/rules/TestContextAdvancer.java
Modified: branches/QUADS_QUERY_BRANCH/bigdata/src/architecture/query-cost-model.xls
===================================================================
(Binary files differ)
Modified: branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/BOpUtility.java
===================================================================
--- branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/BOpUtility.java 2010-10-01 20:37:19 UTC (rev 3717)
+++ branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/BOpUtility.java 2010-10-02 01:00:49 UTC (rev 3718)
@@ -408,7 +408,10 @@
if (!distinct.add(t) && !(t instanceof IVariableOrConstant<?>)) {
/*
* BOp appears more than once. This is only allowed for
- * constants and variables.
+ * constants and variables to reduce the likelihood of operator
+ * trees which describe loops. This will not detect operator
+ * trees whose sinks target a descendant, which is another way
+ * to create a loop.
*/
throw new DuplicateBOpException(t.toString());
}
Modified: branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/ap/filter/DistinctFilter.java
===================================================================
--- branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/ap/filter/DistinctFilter.java 2010-10-01 20:37:19 UTC (rev 3717)
+++ branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/ap/filter/DistinctFilter.java 2010-10-02 01:00:49 UTC (rev 3718)
@@ -21,6 +21,10 @@
* @version $Id: DistinctElementFilter.java 3466 2010-08-27 14:28:04Z
* thompsonbry $
*
+ * @todo Extract a common interface or metadata for all DISTINCT element filters
+ * (in memory hash map, persistence capable hash map, distributed hash
+ * map).
+ *
* @todo Reconcile with {@link IChunkConverter},
* {@link com.bigdata.striterator.DistinctFilter} (handles solutions) and
* {@link MergeFilter} (handles comparables),
@@ -39,6 +43,13 @@
}
/**
+ * An instance using the default configuration for the in memory hash map.
+ */
+ public static DistinctFilter newInstance() {
+ return new DistinctFilter(NOARGS, NOANNS);
+ }
+
+ /**
* Required deep copy constructor.
*/
public DistinctFilter(final DistinctFilter op) {
Modified: branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/engine/Rule2BOpUtility.java
===================================================================
--- branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/engine/Rule2BOpUtility.java 2010-10-01 20:37:19 UTC (rev 3717)
+++ branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/bop/engine/Rule2BOpUtility.java 2010-10-02 01:00:49 UTC (rev 3718)
@@ -59,7 +59,9 @@
import com.bigdata.bop.PipelineOp;
import com.bigdata.bop.Var;
import com.bigdata.bop.ap.Predicate;
+import com.bigdata.bop.ap.filter.DistinctFilter;
import com.bigdata.bop.bset.StartOp;
+import com.bigdata.bop.fed.FederatedQueryEngine;
import com.bigdata.bop.join.PipelineJoin;
import com.bigdata.bop.rdf.filter.StripContextFilter;
import com.bigdata.bop.rdf.join.DataSetJoin;
@@ -89,6 +91,7 @@
import com.bigdata.relation.rule.eval.DefaultEvaluationPlan2;
import com.bigdata.relation.rule.eval.IRangeCountFactory;
import com.bigdata.relation.rule.eval.RuleState;
+import com.bigdata.service.ResourceService;
import com.bigdata.striterator.IKeyOrder;
/**
@@ -119,6 +122,14 @@
* {@link DefaultGraphSolutionExpander}.
*/
private static boolean enableDecisionTree = false;
+
+ /**
+ * The #of samples to take when comparing the cost of a SCAN with an IN
+ * filter to subquery for each graph in the data set.
+ *
+ * @todo Add query hint to override this default.
+ */
+ private static final int SAMPLE_LIMIT = 100;
/**
* Annotations used by the {@link BigdataEvaluationStrategyImpl} to
@@ -248,8 +259,8 @@
public static PipelineOp convert(final IRule rule, final int startId,
final AbstractTripleStore db, final QueryEngine queryEngine) {
- // true iff the database is in quads mode.
- final boolean isQuadsQuery = db.isQuads();
+// // true iff the database is in quads mode.
+// final boolean isQuadsQuery = db.isQuads();
int bopId = startId;
@@ -508,15 +519,6 @@
final List<NV> anns, Predicate pred, final Dataset dataset,
final org.openrdf.query.algebra.Var cvar) {
- /*
- * @todo raise this into the caller and do one per rule rather than once
- * per access path.
- */
- final DataSetSummary summary = new DataSetSummary(dataset
- .getNamedGraphs());
-
- anns.add(new NV(Annotations.NKNOWN, summary.nknown));
-
final boolean scaleOut = queryEngine.isScaleOut();
if (scaleOut) {
anns.add(new NV(Predicate.Annotations.EVALUATION_CONTEXT,
@@ -528,7 +530,19 @@
// true iff C is bound to a constant.
final boolean isCBound = cvar.getValue() != null;
-
+
+ if (dataset == null) {
+
+ /*
+ * The dataset is all graphs. C is left unbound and the unmodified
+ * access path is used.
+ */
+
+ return new PipelineJoin(new BOp[] { left, pred }, anns
+ .toArray(new NV[anns.size()]));
+
+ }
+
if (isCBound) {
/*
@@ -538,8 +552,19 @@
return new PipelineJoin(new BOp[] { left, pred }, anns
.toArray(new NV[anns.size()]));
- } else if (summary.nknown == 0) {
+ }
+
+ /*
+ * @todo raise this into the caller and do one per rule rather than once
+ * per access path.
+ */
+ final DataSetSummary summary = new DataSetSummary(dataset
+ .getNamedGraphs());
+ anns.add(new NV(Annotations.NKNOWN, summary.nknown));
+
+ if (summary.nknown == 0) {
+
/*
* The data set is empty (no graphs). Return a join backed by an
* empty access path.
@@ -553,133 +578,96 @@
return new PipelineJoin(new BOp[] { left, pred }, anns
.toArray(new NV[anns.size()]));
- } else if (summary.nknown == 1) {
+ }
+ if (summary.nknown == 1) {
+
/*
* The dataset contains exactly one graph. Bind C.
*/
-
+
pred = pred.asBound((IVariable) pred.get(3), new Constant(
summary.firstContext));
-
+
return new PipelineJoin(new BOp[] { left, pred }, anns
.toArray(new NV[anns.size()]));
- } else if (dataset == null) {
+ }
+ /*
+ * Estimate cost of SCAN with C unbound.
+ */
+ final double scanCost = queryEngine.estimateCost(context, pred);
+ anns.add(new NV(Annotations.COST_SCAN, scanCost));
+
+ // Estimate cost of SUBQUERY with C bound (sampling).
+ final double subqueryCost = summary.estimateSubqueryCost(queryEngine,
+ context, SAMPLE_LIMIT, pred, anns);
+
+ if (scanCost < subqueryCost) {
+
/*
- * The dataset is all graphs. C is left unbound and the unmodified
- * access path is used.
+ * Scan and filter. C is left unbound. We do a range scan on the
+ * index and filter using an IN constraint.
*/
+ // IN filter for the named graphs.
+ final IElementFilter<ISPO> test = new InGraphHashSetFilter<ISPO>(
+ summary.nknown, summary.graphs);
+
+ // layer filter onto the predicate.
+ pred = pred.addIndexLocalFilter(ElementFilter.newInstance(test));
+
return new PipelineJoin(new BOp[] { left, pred }, anns
.toArray(new NV[anns.size()]));
} else {
/*
- * Estimate cost of SCAN with C unbound.
+ * Parallel Subquery.
*/
- final double scanCost = queryEngine.estimateCost(context, pred);
- anns.add(new NV(Annotations.COST_SCAN, scanCost));
-
/*
- * Estimate cost of SUBQUERY with C bound (sampling).
+ * Setup the data set join.
*
- * @todo This should randomly sample in case there is bias in the
- * order in which the URIs are presented here. However, the only
- * thing which would be likely to create a strong bias is if someone
- * sorted them on the IVs or if the URIs were in the same ordering
- * in which their IVs were assigned AND the data were somehow
- * correlated with that order. I rate the latter as pretty unlikely
- * and the former is not true, so this sampling approach should be
- * pretty good.
+ * @todo When the #of named graphs is large we need to do something
+ * special to avoid sending huge graph sets around with the query.
+ * For example, we should create named data sets and join against
+ * them rather than having an in-memory DataSetJoin.
*
- * @todo parameter for the #of samples to take.
+ * @todo The historical approach performed parallel subquery using
+ * an expander pattern rather than a data set join. The data set
+ * join should have very much the same effect, but it may need to
+ * emit multiple chunks to have good parallelism.
*/
- double subqueryCost = 0d;
- final int limit = 100;
- int nsamples = 0;
- for (URI uri : summary.graphs) {
- if (nsamples == limit)
- break;
- final IV graph = ((BigdataURI) uri).getIV();
- subqueryCost += queryEngine.estimateCost(context, pred.asBound(
- (IVariable) pred.get(3), new Constant(graph)));
- nsamples++;
- }
- subqueryCost /= nsamples;
- anns.add(new NV(Annotations.COST_SUBQUERY, subqueryCost));
- anns.add(new NV(Annotations.COST_SUBQUERY_SAMPLE_COUNT, nsamples));
+ // The variable to be bound.
+ final IVariable var = (IVariable) pred.get(3);
- if (scanCost < subqueryCost * summary.nknown) {
+ // The data set join.
+ final DataSetJoin dataSetJoin = new DataSetJoin(new BOp[] { var },
+ NV.asMap(new NV[] {
+ new NV(DataSetJoin.Annotations.VAR, var),
+ new NV(DataSetJoin.Annotations.GRAPHS, summary
+ .getGraphs()) }));
- /*
- * Scan and filter. C is left unbound. We do a range scan on the
- * index and filter using an IN constraint.
- */
-
- // IN filter for the named graphs.
- final IElementFilter<ISPO> test = new InGraphHashSetFilter<ISPO>(
- summary.nknown, summary.graphs);
-
- // layer filter onto the predicate.
- pred = pred
- .addIndexLocalFilter(ElementFilter.newInstance(test));
-
- return new PipelineJoin(new BOp[] { left, pred }, anns
- .toArray(new NV[anns.size()]));
-
+ if (scaleOut) {
+ anns.add(new NV(Predicate.Annotations.EVALUATION_CONTEXT,
+ BOpEvaluationContext.SHARDED));
+ anns
+ .add(new NV(Predicate.Annotations.REMOTE_ACCESS_PATH,
+ false));
} else {
+ anns.add(new NV(Predicate.Annotations.EVALUATION_CONTEXT,
+ BOpEvaluationContext.ANY));
+ anns
+ .add(new NV(Predicate.Annotations.REMOTE_ACCESS_PATH,
+ false));
+ }
- /*
- * Parallel Subquery.
- */
+ return new PipelineJoin(new BOp[] { left, pred }, anns
+ .toArray(new NV[anns.size()]));
- /*
- * Setup the data set join.
- *
- * @todo When the #of named graphs is large we need to do
- * something special to avoid sending huge graph sets around
- * with the query. For example, we should create named data sets
- * and join against them rather than having an in-memory
- * DataSetJoin.
- *
- * @todo The historical approach performed parallel subquery
- * using an expander pattern rather than a data set join. The
- * data set join should have very much the same effect, but it
- * may need to emit multiple chunks to have good parallelism.
- */
-
- // The variable to be bound.
- final IVariable var = (IVariable) pred.get(3);
-
- // The data set join.
- final DataSetJoin dataSetJoin = new DataSetJoin(
- new BOp[] { var }, NV.asMap(new NV[] {
- new NV(DataSetJoin.Annotations.VAR, var),
- new NV(DataSetJoin.Annotations.GRAPHS, summary
- .getGraphs()) }));
-
- if (scaleOut) {
- anns.add(new NV(Predicate.Annotations.EVALUATION_CONTEXT,
- BOpEvaluationContext.SHARDED));
- anns.add(new NV(Predicate.Annotations.REMOTE_ACCESS_PATH,
- false));
- } else {
- anns.add(new NV(Predicate.Annotations.EVALUATION_CONTEXT,
- BOpEvaluationContext.ANY));
- anns.add(new NV(Predicate.Annotations.REMOTE_ACCESS_PATH,
- false));
- }
-
- return new PipelineJoin(new BOp[] { left, pred }, anns
- .toArray(new NV[anns.size()]));
-
- }
-
}
}
@@ -692,10 +680,6 @@
* @param anns
* @param pred
* @return
- *
- * @todo The default graph remote access path query estimates do not take
- * RMI costs into account. This is Ok since we are only comparing
- * remote access paths with other remote access paths.
*/
private static PipelineOp defaultGraphJoin(final QueryEngine queryEngine,
final BOpContextBase context, final PipelineOp left,
@@ -706,15 +690,12 @@
* @todo raise this into the caller and do one per rule rather than once
* per access path.
*/
- final DataSetSummary summary = new DataSetSummary(dataset
- .getDefaultGraphs());
+ final DataSetSummary summary = dataset == null ? null
+ : new DataSetSummary(dataset.getDefaultGraphs());
- // true iff C is bound to a constant.
- final boolean isCBound = cvar.getValue() != null;
-
final boolean scaleOut = queryEngine.isScaleOut();
- if (summary.nknown == 0) {
+ if (dataset != null && summary.nknown == 0) {
/*
* The data set is empty (no graphs). Return a join backed by an
@@ -729,7 +710,9 @@
return new PipelineJoin(new BOp[] { left, pred }, anns
.toArray(new NV[anns.size()]));
- } else if (summary.nknown == 1) {
+ }
+
+ if (dataset != null && summary.nknown == 1) {
/*
* The dataset contains exactly one graph. Bind C. Add a filter to
@@ -740,12 +723,14 @@
pred = pred.asBound((IVariable) pred.get(3), new Constant(
summary.firstContext));
- pred = pred.addAccessPathFilter(StripContextFilter.INSTANCE);
+ pred = pred.addAccessPathFilter(StripContextFilter.newInstance());
return new PipelineJoin(new BOp[] { left, pred }, anns
.toArray(new NV[anns.size()]));
- } else if (pred.getKeyOrder().getIndexName().endsWith("C")) {
+ }
+
+ if (pred.getKeyOrder().getIndexName().endsWith("C")) {
/*
* C is not bound. An advancer is imposed on the AP to skip to the
@@ -764,7 +749,7 @@
pred = pred.addIndexLocalFilter(new ContextAdvancer());
// Filter to strip off the context position.
- pred = pred.addAccessPathFilter(StripContextFilter.INSTANCE);
+ pred = pred.addAccessPathFilter(StripContextFilter.newInstance());
if(scaleOut) {
@@ -802,129 +787,115 @@
}
}
+
return new PipelineJoin(new BOp[] { left, pred }, anns
.toArray(new NV[anns.size()]));
+
+ }
+
+ // Estimate cost of SCAN with C unbound.
+ final double scanCost = queryEngine.estimateCost(context, pred);
+ anns.add(new NV(Annotations.COST_SCAN, scanCost));
+
+ /*
+ * Estimate cost of SUBQUERY with C bound (sampling). A large value is
+ * used if the dataset is null since the default graph query will run
+ * against all contexts and we are better off doing a SCAN.
+ */
+ final double subqueryCost = dataset == null ? Double.MAX_VALUE
+ : summary.estimateSubqueryCost(queryEngine, context,
+ SAMPLE_LIMIT, pred, anns);
+
+ if (scanCost < subqueryCost) {
+
+ /*
+ * SCAN AND FILTER. C is not bound. Unless all graphs are used,
+ * layer IN filter on the AP to select for the desired graphs. Layer
+ * a filter on the AP to strip off the context position. Layer a
+ * DISTINCT filter on top of that.
+ */
+
+ if (dataset != null) {
+
+ // IN filter for the named graphs.
+ final IElementFilter<ISPO> test = new InGraphHashSetFilter<ISPO>(
+ summary.nknown, summary.graphs);
+
+ // layer filter onto the predicate.
+ pred = pred
+ .addIndexLocalFilter(ElementFilter.newInstance(test));
+
+ }
+
+ // Filter to strip off the context position.
+ pred = pred.addAccessPathFilter(StripContextFilter.newInstance());
+
+ // Filter for distinct SPOs.
+ pred = pred.addAccessPathFilter(DistinctFilter.newInstance());
+
+ return new PipelineJoin(new BOp[] { left, pred }, anns
+ .toArray(new NV[anns.size()]));
+
+ } else {
+
+ /*
+ * PARALLEL SUBQUERY. Bind each value of C in turn, issuing parallel
+ * subqueries against the asBound access paths using an expander
+ * pattern and layer on a filter to strip off the context position.
+ * The asBound access paths write on a shared buffer. That shared
+ * buffer is read from by the expander.
+ *
+ * Scale-out: JOIN is ANY or HASHED. AP is REMOTE.
+ *
+ * FIXME This needs to be implemented based on an expander pattern
+ * which we can capture from DefaultGraphExpander.
+ */
+
+ throw new UnsupportedOperationException();
- }
- // FIXME Finish the default graph decision tree.
- throw new UnsupportedOperationException();
-// } else if (dataset == null) {
-//
// /*
-// * The dataset is all graphs. C is left unbound and the unmodified
-// * access path is used.
-// */
-//
-// return new PipelineJoin(new BOp[] { left, pred }, anns
-// .toArray(new NV[anns.size()]));
-//
-// } else {
-//
-// /*
-// * Estimate cost of SCAN with C unbound.
-// */
-// final double scanCost = queryEngine.estimateCost(context, pred);
-//
-// anns.add(new NV(Annotations.COST_SCAN, scanCost));
-//
-// /*
-// * Estimate cost of SUBQUERY with C bound (sampling).
+// * Setup the data set join.
// *
-// * @todo This should randomly sample in case there is bias in the
-// * order in which the URIs are presented here. However, the only
-// * thing which would be likely to create a strong bias is if someone
-// * sorted them on the IVs or if the URIs were in the same ordering
-// * in which their IVs were assigned AND the data were somehow
-// * correlated with that order. I rate the latter as pretty unlikely
-// * and the former is not true, so this sampling approach should be
-// * pretty good.
+// * @todo When the #of named graphs is large we need to do something
+// * special to avoid sending huge graph sets around with the query.
+// * For example, we should create named data sets and join against
+// * them rather than having an in-memory DataSetJoin.
// *
-// * @todo parameter for the #of samples to take.
+// * @todo The historical approach performed parallel subquery using
+// * an expander pattern rather than a data set join. The data set
+// * join should have very much the same effect, but it may need to
+// * emit multiple chunks to have good parallelism.
// */
-// double subqueryCost = 0d;
-// final int limit = 100;
-// int nsamples = 0;
-// for (URI uri : summary.graphs) {
-// if (nsamples == limit)
-// break;
-// final IV graph = ((BigdataURI) uri).getIV();
-// subqueryCost += queryEngine.estimateCost(context, pred.asBound(
-// (IVariable) pred.get(3), new Constant(graph)));
-// nsamples++;
-// }
-// subqueryCost /= nsamples;
//
-// anns.add(new NV(Annotations.COST_SUBQUERY, subqueryCost));
-// anns.add(new NV(Annotations.COST_SUBQUERY_SAMPLE_COUNT, nsamples));
+// // The variable to be bound.
+// final IVariable var = (IVariable) pred.get(3);
//
-// if (scanCost < subqueryCost * summary.nknown) {
+// // The data set join.
+// final DataSetJoin dataSetJoin = new DataSetJoin(new BOp[] { var },
+// NV.asMap(new NV[] {
+// new NV(DataSetJoin.Annotations.VAR, var),
+// new NV(DataSetJoin.Annotations.GRAPHS, summary
+// .getGraphs()) }));
//
-// /*
-// * Scan and filter. C is left unbound. We do a range scan on the
-// * index and filter using an IN constraint.
-// */
-//
-// // IN filter for the named graphs.
-// final IElementFilter<ISPO> test = new InGraphHashSetFilter<ISPO>(
-// summary.nknown, summary.graphs);
-//
-// // layer filter onto the predicate.
-// pred = pred
-// .addIndexLocalFilter(ElementFilter.newInstance(test));
-//
-// return new PipelineJoin(new BOp[] { left, pred }, anns
-// .toArray(new NV[anns.size()]));
-//
+// if (scaleOut) {
+// anns.add(new NV(Predicate.Annotations.EVALUATION_CONTEXT,
+// BOpEvaluationContext.SHARDED));
+// anns
+// .add(new NV(Predicate.Annotations.REMOTE_ACCESS_PATH,
+// false));
// } else {
-//
-// /*
-// * Parallel Subquery.
-// */
-//
-// /*
-// * Setup the data set join.
-// *
-// * @todo When the #of named graphs is large we need to do
-// * something special to avoid sending huge graph sets around
-// * with the query. For example, we should create named data sets
-// * and join against them rather than having an in-memory
-// * DataSetJoin.
-// *
-// * @todo The historical approach performed parallel subquery
-// * using an expander pattern rather than a data set join. The
-// * data set join should have very much the same effect, but it
-// * may need to emit multiple chunks to have good parallelism.
-// */
-//
-// // The variable to be bound.
-// final IVariable var = (IVariable) pred.get(3);
-//
-// // The data set join.
-// final DataSetJoin dataSetJoin = new DataSetJoin(
-// new BOp[] { var }, NV.asMap(new NV[] {
-// new NV(DataSetJoin.Annotations.VAR, var),
-// new NV(DataSetJoin.Annotations.GRAPHS, summary
-// .getGraphs()) }));
-//
-// if (scaleOut) {
-// anns.add(new NV(Predicate.Annotations.EVALUATION_CONTEXT,
-// BOpEvaluationContext.SHARDED));
-// anns.add(new NV(Predicate.Annotations.REMOTE_ACCESS_PATH,
-// false));
-// } else {
-// anns.add(new NV(Predicate.Annotations.EVALUATION_CONTEXT,
-// BOpEvaluationContext.ANY));
-// anns.add(new NV(Predicate.Annotations.REMOTE_ACCESS_PATH,
-// false));
-// }
-//
-// return new PipelineJoin(new BOp[] { left, pred }, anns
-// .toArray(new NV[anns.size()]));
-//
+// anns.add(new NV(Predicate.Annotations.EVALUATION_CONTEXT,
+// BOpEvaluationContext.ANY));
+// anns
+// .add(new NV(Predicate.Annotations.REMOTE_ACCESS_PATH,
+// false));
// }
//
-// }
+// return new PipelineJoin(new BOp[] { left, pred }, anns
+// .toArray(new NV[anns.size()]));
+ }
+
}
/**
@@ -1113,7 +1084,61 @@
return a;
}
-
+
+ /**
+ * Estimate cost of SUBQUERY with C bound (sampling).
+ *
+ * @param queryEngine
+ * @param context
+ * @param limit
+ * The maximum #of samples to take.
+ * @param pred
+ * The predicate.
+ * @param anns
+ * A list of annotations to which the cost estimate data will
+ * be added.
+ *
+ * @return The estimated cost. This is adjusted based on the sample size
+ * and the #of graphs against which the query was issued and
+ * represents the total expected cost of the subqueries against
+ * all of the graphs in the {@link Dataset}.
+ *
+ * @todo Subquery will be less efficient than a scan when the access
+ * path is remote since there will be remote requests. This model
+ * does not capture that additional overhead. We need to measure
+ * the overhead using appropriate data sets and queries and then
+ * build it into the model. The overhead itself could be changed
+ * dramatically by optimizations in the
+ * {@link FederatedQueryEngine} and the {@link ResourceService}.
+ *
+ * @todo This should randomly sample in case there is bias in the order
+ * in which the URIs are presented here. However, the only thing
+ * which would be likely to create a strong bias is if someone
+ * sorted them on the IVs or if the URIs were in the same ordering
+ * in which their IVs were assigned AND the data were somehow
+ * correlated with that order. I rate the latter as pretty
+ * unlikely and the former is not true, so this sampling approach
+ * should be pretty good.
+ */
+ public double estimateSubqueryCost(QueryEngine queryEngine,
+ BOpContextBase context, int limit, Predicate pred, List<NV> anns) {
+ double subqueryCost = 0d;
+ int nsamples = 0;
+ for (URI uri : graphs) {
+ if (nsamples == limit)
+ break;
+ final IV graph = ((BigdataURI) uri).getIV();
+ subqueryCost += queryEngine.estimateCost(context, pred.asBound(
+ (IVariable) pred.get(3), new Constant(graph)));
+ nsamples++;
+ }
+ subqueryCost = (subqueryCost * nknown) / nsamples;
+
+ anns.add(new NV(Annotations.COST_SUBQUERY, subqueryCost));
+ anns.add(new NV(Annotations.COST_SUBQUERY_SAMPLE_COUNT, nsamples));
+ return subqueryCost;
+ }
+
} // DataSetSummary
/**
Modified: branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/bop/rdf/filter/StripContextFilter.java
===================================================================
--- branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/bop/rdf/filter/StripContextFilter.java 2010-10-01 20:37:19 UTC (rev 3717)
+++ branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/bop/rdf/filter/StripContextFilter.java 2010-10-02 01:00:49 UTC (rev 3718)
@@ -52,10 +52,11 @@
private static final long serialVersionUID = 1L;
/**
- * A global instance.
+ * A default instance.
*/
- public static final transient StripContextFilter INSTANCE = new StripContextFilter(
- BOpBase.NOARGS, BOpBase.NOANNS);
+ public static StripContextFilter newInstance() {
+ return new StripContextFilter(BOpBase.NOARGS, BOpBase.NOANNS);
+ }
/**
* @param op
Modified: branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/rdf/rules/TestContextAdvancer.java
===================================================================
--- branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/rdf/rules/TestContextAdvancer.java 2010-10-01 20:37:19 UTC (rev 3717)
+++ branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/rdf/rules/TestContextAdvancer.java 2010-10-02 01:00:49 UTC (rev 3718)
@@ -180,7 +180,8 @@
pred = pred.addIndexLocalFilter(new ContextAdvancer());
- pred = pred.addAccessPathFilter(StripContextFilter.INSTANCE);
+ pred = pred.addAccessPathFilter(StripContextFilter
+ .newInstance());
final IAccessPath<ISPO> ap = context.getAccessPath(db
.getSPORelation(), pred);
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|