From: <tho...@us...> - 2013-04-26 20:07:21
|
Revision: 7087 http://bigdata.svn.sourceforge.net/bigdata/?rev=7087&view=rev Author: thompsonbry Date: 2013-04-26 20:07:14 +0000 (Fri, 26 Apr 2013) Log Message: ----------- The commit above broke one unit test. If we retain this feature, then the test needs to be modified to predict the new annotation. {{{ TestASTSubGroupJoinVarOptimizer.test_govtrack_21 }}} ---- I refactored the JVMDistinctBindingSets operator to extract its DISTINCT implementation as a JVMDistinctFilter. I added support into the JVMHashJoinUtility implementation for this. The feature MUST be enabled by hand in JVMHashJoinUtility.outputSolutions(). set distinct:=true. When this feature is NOT enabled the following tests fail (these are known failures and documented in the test case). {{{ TestASTSparql11SubqueryOptimizer#test_subSelectWithNoJoinVars() TestASTHashJoinOptimizer#test_hashJoinOptimizer_BSBM_Q5() TestTCK#test_sparql11_order_02() TestTCK#test_sparql11_order_03() }}} If you enable the DISTINCT on the solutions flowing into the child join group, then the following unit tests fail. I suspect that these failures are related to underproducing solutions, but some of them might also be related to a failure to produce the correct set of variables for [projectedInVars]. {{{ TestNamedGraphs#test_default_graph_joins_01f() TestOptionals#test_optionals_simplest() TestOptionals#test_optionals_simplestWithFilter() TestOptionals#test_double_optional_include() TestNegation#test_sparql11_minus_01() TestTCK#test_open_eq_12() }}} ---- Note: We need to add unit tests (SPARQL and HashJoinUtility) for this feature. Note: We need to add unit tests for DISTINCT when some of the variables on which the DISTINCT is imposed are null. ---- @see https://sourceforge.net/apps/trac/bigdata/ticket/668 (JoinGroup optimizations) Modified Paths: -------------- branches/BIGDATA_RELEASE_1_2_0/bigdata/src/java/com/bigdata/bop/join/JVMHashJoinUtility.java branches/BIGDATA_RELEASE_1_2_0/bigdata/src/java/com/bigdata/bop/solutions/JVMDistinctBindingSetsOp.java Added Paths: ----------- branches/BIGDATA_RELEASE_1_2_0/bigdata/src/java/com/bigdata/bop/solutions/JVMDistinctFilter.java Modified: branches/BIGDATA_RELEASE_1_2_0/bigdata/src/java/com/bigdata/bop/join/JVMHashJoinUtility.java =================================================================== --- branches/BIGDATA_RELEASE_1_2_0/bigdata/src/java/com/bigdata/bop/join/JVMHashJoinUtility.java 2013-04-26 18:36:07 UTC (rev 7086) +++ branches/BIGDATA_RELEASE_1_2_0/bigdata/src/java/com/bigdata/bop/join/JVMHashJoinUtility.java 2013-04-26 20:07:14 UTC (rev 7087) @@ -49,6 +49,7 @@ import com.bigdata.bop.IVariable; import com.bigdata.bop.PipelineOp; import com.bigdata.bop.engine.BOpStats; +import com.bigdata.bop.solutions.JVMDistinctFilter; import com.bigdata.counters.CAT; import com.bigdata.htree.HTree; import com.bigdata.rdf.internal.impl.literal.XSDBooleanIV; @@ -394,6 +395,14 @@ private final AtomicBoolean open = new AtomicBoolean(true); /** + * The operator whose annotations are used to initialize this object. + * <p> + * Note: This was added to support the DISTINCT FILTER in + * {@link #outputSolutions(IBuffer)}. + */ + private final PipelineOp op; + + /** * The type of join to be performed. */ private final JoinTypeEnum joinType; @@ -419,12 +428,22 @@ private final IVariable<?>[] joinVars; /** - * The variables to be retained (optional, all variables are retained if - * not specified). + * The variables to be retained (aka projected out) (optional, all variables + * are retained if not specified). */ private final IVariable<?>[] selectVars; /** + * The variables to be projected into a join group. When non- + * <code>null</code> variables that are NOT in this array are NOT flowed + * into the join group. + * + * @see <a href="https://sourceforge.net/apps/trac/bigdata/ticket/668" > + * JoinGroup optimizations </a> + */ + private final IVariable<?>[] projectedInVars; + + /** * The join constraints (optional). */ private final IConstraint[] constraints; @@ -515,6 +534,7 @@ if(joinType == null) throw new IllegalArgumentException(); + this.op = op; this.joinType = joinType; this.optional = joinType == JoinTypeEnum.Optional; this.filter = joinType == JoinTypeEnum.Filter; @@ -527,11 +547,19 @@ this.joinVars = (IVariable<?>[]) op .getRequiredProperty(HashJoinAnnotations.JOIN_VARS); - // The projected variables (optional and equal to the join variables iff - // this is a DISTINCT filter). + /* + * The projected OUT variables (optional and equal to the join variables + * iff this is a DISTINCT filter). + */ this.selectVars = filter ? joinVars : (IVariable<?>[]) op .getProperty(JoinAnnotations.SELECT); + /* + * The variables that are projected IN to the join group. + */ + this.projectedInVars = (IVariable<?>[]) op + .getProperty(HashJoinAnnotations.PROJECT_IN_VARS); + // The join constraints (optional). this.constraints = (IConstraint[]) op .getProperty(JoinAnnotations.CONSTRAINTS); @@ -557,7 +585,8 @@ * keyword. This would give us what amounts to per-hash code striped * locks. Note: the JVMDistinctBindingSetsOp does not use this class * right now because it enjoys better concurrency than the - * JVMHashJoinUtility. + * JVMHashJoinUtility. Also see JVMDistinctFilter, which is the backing + * implementation for the JVMDistinctBindingSetsOp. */ rightSolutionsRef.set(new LinkedHashMap<Key, Bucket>(// op.getProperty(HashMapAnnotations.INITIAL_CAPACITY, @@ -682,8 +711,8 @@ } @Override - public long filterSolutions(ICloseableIterator<IBindingSet[]> itr, - BOpStats stats, IBuffer<IBindingSet> sink) { + public long filterSolutions(final ICloseableIterator<IBindingSet[]> itr, + final BOpStats stats, final IBuffer<IBindingSet> sink) { try { @@ -1085,6 +1114,47 @@ @Override public void outputSolutions(final IBuffer<IBindingSet> out) { + /* + * FIXME Set this to enable "DISTINCT" on the solutions flowing into the + * join group. + * + * Note: This should be set by the HashIndexOp (or passed in through the + * interface). + * + * Note: Enabling this causes failures. See the ticket below. I suspect + * that these failures are related to underproducing solutions, but some + * of them might also be related to a failure to produce the correct set + * of variables for [projectedInVars]. + * + * @see <a href="https://sourceforge.net/apps/trac/bigdata/ticket/668" > + * JoinGroup optimizations </a> + */ + final boolean distinct = false; + + final JVMDistinctFilter distinctFilter; + + if (distinct && projectedInVars != null && projectedInVars.length > 0) { + + /* + * Note: We are single threaded here so we can use a lower + * concurrencyLevel value. + */ + final int concurrencyLevel = 1;//ConcurrentHashMapAnnotations.DEFAULT_CONCURRENCY_LEVEL; + + distinctFilter = new JVMDistinctFilter(projectedInVars, // + op.getProperty(HashMapAnnotations.INITIAL_CAPACITY, + HashMapAnnotations.DEFAULT_INITIAL_CAPACITY),// + op.getProperty(HashMapAnnotations.LOAD_FACTOR, + HashMapAnnotations.DEFAULT_LOAD_FACTOR),// + concurrencyLevel + ); + + } else { + + distinctFilter = null; + + } + try { // if (true) { @@ -1151,6 +1221,26 @@ IBindingSet bs = solutionHit.solution; + if (distinctFilter != null) { + + /* + * Note: The DISTINCT filter is based on the + * variables that are projected INTO the child + * join group. However, those are NOT always + * the same as the variables that are projected + * OUT of the child join group, so we need to + * + */ + + if (distinctFilter.accept2(bs) == null) { + + // Drop duplicate solutions. + continue; + + } + + } + if (selected != null) { // Drop variables which are not projected. Modified: branches/BIGDATA_RELEASE_1_2_0/bigdata/src/java/com/bigdata/bop/solutions/JVMDistinctBindingSetsOp.java =================================================================== --- branches/BIGDATA_RELEASE_1_2_0/bigdata/src/java/com/bigdata/bop/solutions/JVMDistinctBindingSetsOp.java 2013-04-26 18:36:07 UTC (rev 7086) +++ branches/BIGDATA_RELEASE_1_2_0/bigdata/src/java/com/bigdata/bop/solutions/JVMDistinctBindingSetsOp.java 2013-04-26 20:07:14 UTC (rev 7087) @@ -1,26 +1,43 @@ +/** + +Copyright (C) SYSTAP, LLC 2006-2010. All rights reserved. + +Contact: + SYSTAP, LLC + 4501 Tower Road + Greensboro, NC 27410 + lic...@bi... + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ package com.bigdata.bop.solutions; -import java.util.Arrays; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.concurrent.Callable; -import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.FutureTask; -import org.apache.log4j.Logger; - import com.bigdata.bop.BOp; import com.bigdata.bop.BOpContext; import com.bigdata.bop.ConcurrentHashMapAnnotations; import com.bigdata.bop.IBindingSet; -import com.bigdata.bop.IConstant; import com.bigdata.bop.IQueryAttributes; import com.bigdata.bop.IVariable; import com.bigdata.bop.NV; import com.bigdata.bop.PipelineOp; -import com.bigdata.bop.bindingSet.ListBindingSet; import com.bigdata.bop.engine.BOpStats; import com.bigdata.bop.join.JVMHashJoinUtility; import com.bigdata.relation.accesspath.IBlockingBuffer; @@ -31,22 +48,18 @@ * <p> * Note: This implementation is a pipelined operator which inspects each chunk * of solutions as they arrive and those solutions which are distinct for each - * chunk are passed on. It uses a {@link ConcurrentMap} and is thread-safe. + * chunk are passed on. It uses a {@link ConcurrentMap} and is thread-safe. It + * is significantly faster than the single-threaded hash index routines in the + * {@link JVMHashJoinUtility}. * - * TODO Look into reconciling this class with {@link JVMHashJoinUtility}. - * However, note that this implementation is thread-safe and uses a - * {@link ConcurrentMap}. It may be better to leave things as they are since - * this implementation may be more efficient for the special case which it - * handles. - * * @author <a href="mailto:tho...@us...">Bryan Thompson</a> * @version $Id: DistinctElementFilter.java 3466 2010-08-27 14:28:04Z * thompsonbry $ */ public class JVMDistinctBindingSetsOp extends PipelineOp { - private final static transient Logger log = Logger - .getLogger(JVMDistinctBindingSetsOp.class); +// private final static transient Logger log = Logger +// .getLogger(JVMDistinctBindingSetsOp.class); /** * @@ -148,81 +161,21 @@ } /** - * Wrapper used for the as bound solutions in the {@link ConcurrentHashMap}. - */ - private static class Solution { - - private final int hash; - - private final IConstant<?>[] vals; - - public Solution(final IConstant<?>[] vals) { - this.vals = vals; - this.hash = java.util.Arrays.hashCode(vals); - } - - public int hashCode() { - return hash; - } - - public boolean equals(final Object o) { - if (this == o) - return true; - if (!(o instanceof Solution)) { - return false; - } - final Solution t = (Solution) o; - if (vals.length != t.vals.length) - return false; - for (int i = 0; i < vals.length; i++) { - // @todo verify that this allows for nulls with a unit test. - if (vals[i] == t.vals[i]) - continue; - if (vals[i] == null) - return false; - if (!vals[i].equals(t.vals[i])) - return false; - } - return true; - } - } - - /** * Task executing on the node. */ static private class DistinctTask implements Callable<Void> { private final BOpContext<IBindingSet> context; - /** - * A concurrent map whose keys are the bindings on the specified - * variables (the keys and the values are the same since the map - * implementation does not allow <code>null</code> values). - * <p> - * Note: The map is shared state and can not be discarded or cleared - * until the last invocation!!! - */ - private final ConcurrentHashMap<Solution, Solution> map; - - /** - * The variables used to impose a distinct constraint. - */ - private final IVariable<?>[] vars; + private final JVMDistinctFilter filter; - @SuppressWarnings("unchecked") DistinctTask(final JVMDistinctBindingSetsOp op, final BOpContext<IBindingSet> context) { this.context = context; - this.vars = op.getVariables(); + final IVariable<?>[] vars = op.getVariables(); - if (vars == null) - throw new IllegalArgumentException(); - - if (vars.length == 0) - throw new IllegalArgumentException(); - /* * The map is shared state across invocations of this operator task. */ @@ -232,68 +185,28 @@ final IQueryAttributes attribs = context.getRunningQuery() .getAttributes(); - ConcurrentHashMap<Solution, Solution> map = (ConcurrentHashMap<Solution, Solution>) attribs - .get(key); + JVMDistinctFilter filter = (JVMDistinctFilter) attribs.get(key); - if (map == null) { + if (filter == null) { - map = new ConcurrentHashMap<Solution, Solution>( + filter = new JVMDistinctFilter(vars, op.getInitialCapacity(), op.getLoadFactor(), op.getConcurrencyLevel()); - final ConcurrentHashMap<Solution, Solution> tmp = (ConcurrentHashMap<Solution, Solution>) attribs - .putIfAbsent(key, map); + final JVMDistinctFilter tmp = (JVMDistinctFilter) attribs + .putIfAbsent(key, filter); if (tmp != null) - map = tmp; + filter = tmp; } - this.map = map; + this.filter = filter; } } - /** - * If the bindings are distinct for the configured variables then return - * those bindings. - * - * @param bset - * The binding set to be filtered. - * - * @return The distinct as bound values -or- <code>null</code> if the - * binding set duplicates a solution which was already accepted. - */ - private IConstant<?>[] accept(final IBindingSet bset) { - - final IConstant<?>[] r = new IConstant<?>[vars.length]; - - for (int i = 0; i < vars.length; i++) { - - /* - * Note: This allows null's. - * - * @todo write a unit test when some variables are not bound. - */ - r[i] = bset.get(vars[i]); - - } - - final Solution s = new Solution(r); - - if (log.isTraceEnabled()) - log.trace("considering: " + Arrays.toString(r)); - - final boolean distinct = map.putIfAbsent(s, s) == null; - - if (distinct && log.isDebugEnabled()) - log.debug("accepted: " + Arrays.toString(r)); - - return distinct ? r : null; - - } - public Void call() throws Exception { final BOpStats stats = context.getStats(); @@ -323,27 +236,14 @@ * Test to see if this solution is distinct from those * already seen. */ - final IConstant<?>[] vals = accept(bset); + final IBindingSet tmp = filter.accept2(bset); - if (vals != null) { + if (tmp != null) { - /* - * This is a distinct solution. Copy only the - * variables used to select distinct solutions into - * a new binding set and add that to the set of - * [accepted] binding sets which will be emitted by - * this operator. - */ - - final ListBindingSet tmp = new ListBindingSet(); - - for (int i = 0; i < vars.length; i++) { - - if (vals[i] != null) - tmp.set(vars[i], vals[i]); - - } - + /* + * This is a distinct solution. + */ + accepted.add(tmp); naccepted++; @@ -391,7 +291,7 @@ * is not going to be cleared until the query goes out of * scope and is swept by GC. */ - map.clear(); + filter.clear(); } Added: branches/BIGDATA_RELEASE_1_2_0/bigdata/src/java/com/bigdata/bop/solutions/JVMDistinctFilter.java =================================================================== --- branches/BIGDATA_RELEASE_1_2_0/bigdata/src/java/com/bigdata/bop/solutions/JVMDistinctFilter.java (rev 0) +++ branches/BIGDATA_RELEASE_1_2_0/bigdata/src/java/com/bigdata/bop/solutions/JVMDistinctFilter.java 2013-04-26 20:07:14 UTC (rev 7087) @@ -0,0 +1,224 @@ +/** + +Copyright (C) SYSTAP, LLC 2006-2010. All rights reserved. + +Contact: + SYSTAP, LLC + 4501 Tower Road + Greensboro, NC 27410 + lic...@bi... + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +/* + * Created on Apr 26, 2013 + */ +package com.bigdata.bop.solutions; + +import java.util.Arrays; +import java.util.concurrent.ConcurrentHashMap; + +import org.apache.log4j.Logger; + +import com.bigdata.bop.IBindingSet; +import com.bigdata.bop.IConstant; +import com.bigdata.bop.IVariable; +import com.bigdata.bop.bindingSet.ListBindingSet; + +/** + * Utility class for imposing a DISTINCT filter on {@link IBindingSet}. This + * class is thread-safe. It is based on a {@link ConcurrentHashMap}. + * + * @author <a href="mailto:tho...@us...">Bryan Thompson</a> + */ +public class JVMDistinctFilter { + + private static final Logger log = Logger.getLogger(JVMDistinctFilter.class); + + /** + * Wrapper used for the as bound solutions in the {@link ConcurrentHashMap}. + */ + private static class Solution { + + private final int hash; + + private final IConstant<?>[] vals; + + public Solution(final IConstant<?>[] vals) { + this.vals = vals; + this.hash = java.util.Arrays.hashCode(vals); + } + + public int hashCode() { + return hash; + } + + public boolean equals(final Object o) { + if (this == o) + return true; + if (!(o instanceof Solution)) { + return false; + } + final Solution t = (Solution) o; + if (vals.length != t.vals.length) + return false; + for (int i = 0; i < vals.length; i++) { + // @todo verify that this allows for nulls with a unit test. + if (vals[i] == t.vals[i]) + continue; + if (vals[i] == null) + return false; + if (!vals[i].equals(t.vals[i])) + return false; + } + return true; + } + } + + /** + * The variables used to impose a distinct constraint. + */ + private final IVariable<?>[] vars; + + /** + * A concurrent map whose keys are the bindings on the specified variables + * (the keys and the values are the same since the map implementation does + * not allow <code>null</code> values). + * <p> + * Note: The map is shared state and can not be discarded or cleared until + * the last invocation!!! + */ + private final ConcurrentHashMap<Solution, Solution> map; + + /** + * + * @param vars + * The set of variables on which the DISTINCT filter will be + * imposed. Only these variables will be present in the + * "accepted" solutions. Any variable bindings not specified in + * this array will be dropped). + * @param initialCapacity + * @param loadFactor + * @param concurrencyLevel + */ + public JVMDistinctFilter(final IVariable<?>[] vars, + final int initialCapacity, final float loadFactor, + final int concurrencyLevel) { + + if (vars == null) + throw new IllegalArgumentException(); + + if (vars.length == 0) + throw new IllegalArgumentException(); + + this.vars = vars; + + this.map = new ConcurrentHashMap<Solution, Solution>(initialCapacity, + loadFactor, concurrencyLevel); + + } + + /** + * If the bindings are distinct for the configured variables then return + * those bindings. + * + * @param bset + * The binding set to be filtered. + * + * @return The distinct as bound values -or- <code>null</code> if the + * binding set duplicates a solution which was already accepted. + */ + public IConstant<?>[] accept(final IBindingSet bset) { + + final IConstant<?>[] r = new IConstant<?>[vars.length]; + + for (int i = 0; i < vars.length; i++) { + + /* + * Note: This allows null's. + * + * @todo write a unit test when some variables are not bound. + */ + r[i] = bset.get(vars[i]); + + } + + final Solution s = new Solution(r); + + if (log.isTraceEnabled()) + log.trace("considering: " + Arrays.toString(r)); + + final boolean distinct = map.putIfAbsent(s, s) == null; + + if (distinct && log.isDebugEnabled()) + log.debug("accepted: " + Arrays.toString(r)); + + return distinct ? r : null; + + } + + /** + * If the bindings are distinct for the configured variables then return a + * new {@link IBindingSet} consisting of only the selected variables. + * + * @param bset + * The binding set to be filtered. + * + * @return A new {@link IBindingSet} containing only the distinct as bound + * values -or- <code>null</code> if the binding set duplicates a + * solution which was already accepted. + */ + public IBindingSet accept2(final IBindingSet bset) { + + final IConstant<?>[] vals = accept(bset); + + if (vals == null) { + + /* + * This is a duplicate solution. + */ + + return null; + + } + + /* + * This is a distinct solution. Copy only the variables used to select + * distinct solutions into a new binding set and add that to the set of + * [accepted] binding sets which will be emitted by this operator. + */ + + final ListBindingSet tmp = new ListBindingSet(); + + for (int i = 0; i < vars.length; i++) { + + if (vals[i] != null) + tmp.set(vars[i], vals[i]); + + } + + return tmp; + + } + + /** + * Discard the map backing this filter. + */ + public void clear() { + + map.clear(); + + } + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |