From: Bryan T. <tho...@us...> - 2007-04-18 17:29:13
Update of /cvsroot/cweb/bigdata-rdf/src/java/com/bigdata/rdf/sail
In directory sc8-pr-cvs4.sourceforge.net:/tmp/cvs-serv22601/src/java/com/bigdata/rdf/sail

Modified Files:
	SimpleRdfRepository.java
Added Files:
	.cvsignore Options.java
Log Message:
Testing SAIL and LUBM, including adding BTree#removeAll(), touching up some
inferences, making it possible to load different RDF interchange formats, and
adding JOIN ordering based on the Sesame optimizer and the actual triple
pattern selectivity in the data.

Index: SimpleRdfRepository.java
===================================================================
RCS file: /cvsroot/cweb/bigdata-rdf/src/java/com/bigdata/rdf/sail/SimpleRdfRepository.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** SimpleRdfRepository.java	14 Apr 2007 13:49:58 -0000	1.3
--- SimpleRdfRepository.java	18 Apr 2007 17:29:07 -0000	1.4
***************
*** 51,57 ****
--- 51,62 ----
  import java.io.IOException;
  import java.util.ArrayList;
+ import java.util.HashSet;
+ import java.util.Hashtable;
+ import java.util.Iterator;
+ import java.util.LinkedList;
  import java.util.List;
  import java.util.Map;
  import java.util.Properties;
+ import java.util.Set;
  import java.util.Vector;
***************
*** 73,78 ****
--- 78,94 ----
  import org.openrdf.sesame.sail.SailUpdateException;
  import org.openrdf.sesame.sail.StatementIterator;
+ import org.openrdf.sesame.sail.query.BooleanExpr;
+ import org.openrdf.sesame.sail.query.DirectSubClassOf;
+ import org.openrdf.sesame.sail.query.DirectSubPropertyOf;
+ import org.openrdf.sesame.sail.query.DirectType;
+ import org.openrdf.sesame.sail.query.GraphPattern;
+ import org.openrdf.sesame.sail.query.GraphPatternQuery;
+ import org.openrdf.sesame.sail.query.PathExpression;
  import org.openrdf.sesame.sail.query.Query;
  import org.openrdf.sesame.sail.query.QueryOptimizer;
+ import org.openrdf.sesame.sail.query.SetOperator;
+ import org.openrdf.sesame.sail.query.TriplePattern;
+ import org.openrdf.sesame.sail.query.ValueCompare;
+ import org.openrdf.sesame.sail.query.ValueExpr;
  import org.openrdf.sesame.sail.query.Var;
  import org.openrdf.sesame.sail.util.EmptyStatementIterator;
***************
*** 80,86 ****
  import org.openrdf.sesame.sail.util.SingleStatementIterator;
  
  import com.bigdata.btree.IEntryIterator;
  import com.bigdata.journal.Journal;
- import com.bigdata.journal.Options;
  import com.bigdata.rdf.KeyOrder;
  import com.bigdata.rdf.TripleStore;
--- 96,102 ----
  import org.openrdf.sesame.sail.util.SingleStatementIterator;
  
+ import com.bigdata.btree.BTree;
  import com.bigdata.btree.IEntryIterator;
  import com.bigdata.journal.Journal;
  import com.bigdata.rdf.KeyOrder;
  import com.bigdata.rdf.TripleStore;
***************
*** 111,115 ****
   * A custom integration is provided for directly loading data into the triple
   * store with throughput of 20,000+ triples per second - see
!  * {@link #loadData(java.io.File, String, boolean)}.
   * <p>
   * <em>THIS CLASS IS NOT THREAD SAFE</em>
--- 127,135 ----
   * A custom integration is provided for directly loading data into the triple
   * store with throughput of 20,000+ triples per second - see
!  * {@link TripleStore#loadData(File, String, org.openrdf.sesame.constants.RDFFormat, boolean, boolean)}.
!  * This method lies outside of the SAIL and does NOT rely on the SAIL
!  * "transaction" mechanisms. This method does NOT perform RDFS closure - you
!  * must explicitly request that yourself, e.g., by specifying
!  * <code>commit:=false</code> and then invoking {@link #fullForwardClosure()}.
   * <p>
   * <em>THIS CLASS IS NOT THREAD SAFE</em>
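
For example, a minimal usage sketch of the batch-load path described in the
javadoc above. This is hedged: the order of the two trailing boolean
parameters (assumed here to be verifyData, commit) should be checked against
TripleStore#loadData, and the file name and baseURI are hypothetical.

    void loadAndClose(SimpleRdfRepository repo) throws java.io.IOException {

        // The new accessor returns the InferenceEngine implementation object.
        InferenceEngine store = repo.getTripleStore();

        // Load outside the SAIL "transaction" mechanism; commit:=false so
        // the closure can be computed before the store is committed.
        store.loadData(new java.io.File("lubm/University0_0.owl"),
                "http://example.org/" /* baseURI */,
                org.openrdf.sesame.constants.RDFFormat.RDFXML,
                false /* verifyData? (assumed) */,
                false /* commit */);

        // loadData does NOT compute the RDFS closure - request it
        // explicitly; fullForwardClosure() then commits the store.
        repo.fullForwardClosure();

    }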
***************
*** 134,140 ****
  protected static final long NULL = TripleStore.NULL;
  
! private OptimizedValueFactory valueFactory;
! private InferenceEngine tripleStore;
! private Properties properties;
  
  private boolean transactionStarted;
--- 154,179 ----
  protected static final long NULL = TripleStore.NULL;
  
! protected OptimizedValueFactory valueFactory;
! protected InferenceEngine tripleStore;
! protected Properties properties;
! 
! /**
!  * When true, the RDFS closure will be maintained.
!  */
! private boolean rdfsClosure;
! 
! /**
!  * When true, the RDFS closure will be maintained by the <em>SAIL</em>
!  * implementation (but not by methods that go around the SAIL).
!  */
! public boolean isRdfsClosure() {
! 
!     return rdfsClosure;
! 
! }
! 
! /**
!  * When true, a SAIL "transaction" is running.
!  */
  private boolean transactionStarted;
***************
*** 142,146 ****
   * The implementation object.
   */
! public TripleStore getTripleStore() {
  
      return tripleStore;
--- 181,185 ----
   * The implementation object.
   */
! public InferenceEngine getTripleStore() {
  
      return tripleStore;
***************
*** 283,289 ****
  /**
!  * Note: This is not implemented. Since there is only one RdfRepository per
!  * persistence store, the easiest way to achieve this end is to delete the
!  * persistence store and open/create a new one.
   */
  public void clearRepository() throws SailUpdateException {
--- 322,328 ----
  /**
!  * Note: Since there is only one RdfRepository per persistence store, the
!  * easiest way to achieve this end is to delete the persistence store and
!  * open/create a new one.
   */
  public void clearRepository() throws SailUpdateException {
***************
*** 291,300 ****
      assertTransactionStarted();
  
!     // TODO Auto-generated method stub
  
      m_stmtRemoved = true;
  
-     throw new UnsupportedOperationException();
- 
  }
--- 330,341 ----
      assertTransactionStarted();
  
!     ((BTree)tripleStore.getTermIdIndex()).removeAll();
!     ((BTree)tripleStore.getIdTermIndex()).removeAll();
!     ((BTree)tripleStore.getSPOIndex()).removeAll();
!     ((BTree)tripleStore.getPOSIndex()).removeAll();
!     ((BTree)tripleStore.getOSPIndex()).removeAll();
  
      m_stmtRemoved = true;
  
  }
***************
*** 367,377 ****
      /*
!      * @todo make closure optional? Is closure required by the Sesame test
!      * suite? Mark entailments in the KB using the btree entry value?
       */
!     tripleStore.fullForwardClosure();
  
      tripleStore.commit();
  
      transactionStarted = false;
--- 408,427 ----
      /*
!      * Optionally compute the full forward closure over the store.
!      * 
!      * @todo In order to handle closure properly we should delete the
!      * closure if any statements are removed from the store (the entailments
!      * are not explicitly marked so this SAIL does not do this).
       */
!     if (rdfsClosure) {
! 
!         tripleStore.fullForwardClosure();
! 
!     }
  
      tripleStore.commit();
  
+     if(true) tripleStore.dumpStore();
+ 
      transactionStarted = false;
***************
*** 534,538 ****
  /**
!  * @todo rewrite the join order based on selectivity.
   */
  public Query optimizeQuery(Query query) {
--- 584,588 ----
  /**
!  * 
   */
  public Query optimizeQuery(Query query) {
***************
*** 543,547 ****
       */
  
!     QueryOptimizer.optimizeQuery(query);
  
      /*
--- 593,605 ----
       */
  
!     // QueryOptimizer.optimizeQuery(query);
! 
!     /*
!      * This variant is based on the Sesame optimizer but it uses range
!      * counts to order triple patterns based on their actual selectivity
!      * in the data at the time that the query is run. This can be a big
!      * win depending on the query.
!      */
! 
!     optimizeQuery2(query);
  
      /*
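
To make the new commit path concrete, a sketch of the SAIL-level flow when
rdfsClosure is enabled. This assumes the standard Sesame 1.x RdfRepository
transaction methods (startTransaction/addStatement/commitTransaction); s, p,
and o are hypothetical values.

    // Assumes the repository was initialized with rdfsClosure=true
    // (see Options.RDFS_CLOSURE below).
    repo.startTransaction();
    repo.addStatement(s, p, o);   // SAIL-level writes
    repo.commitTransaction();     // runs fullForwardClosure(), then commit()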
***************
*** 557,562 ****
--- 615,978 ----
  }
+ 
+ /**
+  * An attempt to get the Sesame query optimizer to choose the join
+  * order based on the actual selectivity of the triple patterns.
+  * 
+  * @param qc
+  */
+ private void optimizeQuery2(Query qc) {
+     if (qc instanceof GraphPatternQuery) {
+         GraphPatternQuery gpQuery = (GraphPatternQuery)qc;
+         _optimizeGraphPattern(gpQuery.getGraphPattern(), new HashSet());
+     }
+     else if (qc instanceof SetOperator) {
+         SetOperator setOp = (SetOperator)qc;
+         optimizeQuery( setOp.getLeftArg() );
+         optimizeQuery( setOp.getRightArg() );
+     }
+ }
+ 
+ private void _optimizeGraphPattern(GraphPattern graphPattern, Set boundVars) {
+     // Optimize any optional child graph patterns:
+     Iterator iter = graphPattern.getOptionals().iterator();
+ 
+     if (iter.hasNext()) {
+         // Build set of variables that are bound in this scope
+         Set scopeVars = new HashSet(boundVars);
+         graphPattern.getLocalVariables(scopeVars);
+ 
+         // Optimize recursively
+         while (iter.hasNext()) {
+             GraphPattern optionalGP = (GraphPattern)iter.next();
+             _optimizeGraphPattern(optionalGP, new HashSet(scopeVars));
+         }
+     }
+ 
+     // Optimize the GraphPattern itself:
+     _inlineVarAssignments(graphPattern);
+     _orderExpressions(graphPattern, boundVars);
+ }
+ 
+ /**
+  * Inlines as many of the "variable assignments" (comparisons between a
+  * variable and a fixed value) found in the list of conjunctive
+  * constraints as possible, and removes them from the query. Only variable
+  * assignments for variables that are used in <tt>graphPattern</tt> itself
+  * are processed. Inlining variable assignments for variables that are
+  * (only) used in optional child graph patterns leads to incorrect query
+  * evaluation.
+  **/
+ private void _inlineVarAssignments(GraphPattern graphPattern) {
+     Set localVars = new HashSet();
+     graphPattern.getLocalVariables(localVars);
+ 
+     boolean constraintsModified = false;
+ 
+     List conjunctiveConstraints =
+         new ArrayList(graphPattern.getConjunctiveConstraints());
+ 
+     Iterator iter = conjunctiveConstraints.iterator();
+ 
+     while (iter.hasNext()) {
+         BooleanExpr boolExpr = (BooleanExpr)iter.next();
+ 
+         if (boolExpr instanceof ValueCompare) {
+             ValueCompare valueCompare = (ValueCompare)boolExpr;
+ 
+             if (valueCompare.getOperator() != ValueCompare.EQ) {
+                 continue;
+             }
+ 
+             ValueExpr arg1 = valueCompare.getLeftArg();
+             ValueExpr arg2 = valueCompare.getRightArg();
+ 
+             Var varArg = null;
+             Value value = null;
+ 
+             if (arg1 instanceof Var && arg1.getValue() == null && // arg1 is an unassigned var
+                 arg2.getValue() != null) // arg2 has a value
+             {
+                 varArg = (Var)arg1;
+                 value = arg2.getValue();
+             }
+             else if (arg2 instanceof Var && arg2.getValue() == null && // arg2 is an unassigned var
+                 arg1.getValue() != null) // arg1 has a value
+             {
+                 varArg = (Var)arg2;
+                 value = arg1.getValue();
+             }
+ 
+             if (varArg != null && localVars.contains(varArg)) {
+                 // Inline this variable assignment
+                 varArg.setValue(value);
+ 
+                 // Remove the (now redundant) constraint
+                 iter.remove();
+ 
+                 constraintsModified = true;
+             }
+         }
+     }
+ 
+     if (constraintsModified) {
+         graphPattern.setConstraints(conjunctiveConstraints);
+     }
+ }
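
As a concrete (hypothetical) illustration of what the inlining step does:

    // Before: graph pattern {(x, rdf:type, y)} with constraint x = foo:bar
    // After:  graph pattern {(foo:bar, rdf:type, y)}, constraint removed.
    // Binding x up front lets the triple pattern be matched against a more
    // selective index range instead of filtering after the fact.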
+ 
+ /**
+  * Merges the boolean constraints and the path expressions into one single
+  * list. The order of the path expressions is not changed, but the boolean
+  * constraints are inserted between them. The separate boolean constraints
+  * are moved to the start of the list as much as possible, under the
+  * condition that all variables that are used in the constraint are
+  * instantiated by the path expressions that are earlier in the list. An
+  * example combined list might be:
+  * <tt>[(A,B,C), A != foo:bar, (B,E,F), C != F, (F,G,H)]</tt>.
+  **/
+ private void _orderExpressions(GraphPattern graphPattern, Set boundVars) {
+     List expressions = new ArrayList();
+     List conjunctiveConstraints = new LinkedList(graphPattern.getConjunctiveConstraints());
+ 
+     // First evaluate any constraints that don't depend on any variables:
+     _addVerifiableConstraints(conjunctiveConstraints, boundVars, expressions);
+ 
+     // Then evaluate all path expressions from graphPattern
+     List pathExpressions = new LinkedList(graphPattern.getPathExpressions());
+     Hashtable<PathExpression,Integer> rangeCounts = new Hashtable<PathExpression, Integer>();
+     while (!pathExpressions.isEmpty()) {
+         PathExpression pe = _getMostSpecificPathExpression(pathExpressions, boundVars, rangeCounts);
+ 
+         pathExpressions.remove(pe);
+         expressions.add(pe);
+ 
+         pe.getVariables(boundVars);
+ 
+         _addVerifiableConstraints(conjunctiveConstraints, boundVars, expressions);
+     }
+ 
+     // Finally, evaluate any optional child graph pattern lists
+     List optionals = new LinkedList(graphPattern.getOptionals());
+     while (!optionals.isEmpty()) {
+         PathExpression pe = _getMostSpecificPathExpression(optionals, boundVars, rangeCounts);
+ 
+         optionals.remove(pe);
+         expressions.add(pe);
+ 
+         pe.getVariables(boundVars);
+ 
+         _addVerifiableConstraints(conjunctiveConstraints, boundVars, expressions);
+     }
+ 
+     // All constraints should be verifiable when all path expressions are
+     // evaluated, but add any remaining constraints anyway
+     expressions.addAll(conjunctiveConstraints);
+ 
+     graphPattern.setExpressions(expressions);
+ }
+ 
+ /**
+  * Gets the most specific path expression from <tt>pathExpressions</tt>,
+  * given that the variables in <tt>boundVars</tt> have already been assigned
+  * values. The most specific path expression is the path expression with
+  * the least number of unbound variables.
+  **/
+ private PathExpression _getMostSpecificPathExpression(
+     List pathExpressions, Set boundVars, Hashtable<PathExpression,Integer> rangeCounts)
+ {
+     int minVars = Integer.MAX_VALUE;
+     int minRangeCount = Integer.MAX_VALUE;
+     PathExpression result = null;
+     ArrayList vars = new ArrayList();
+ 
+     for (int i = 0; i < pathExpressions.size(); i++) {
+         PathExpression pe = (PathExpression)pathExpressions.get(i);
+ 
+         /*
+          * The #of results for this PathExpression or -1 if not a
+          * TriplePattern.
+          * 
+          * @todo if zero (0), then at least order it first.
+          */
+         int rangeCount = getRangeCount(pe, rangeCounts);
+ 
+         // Get the variables that are used in this path expression
+         vars.clear();
+         pe.getVariables(vars);
+ 
+         // Count unbound variables
+         int varCount = 0;
+         for (int j = 0; j < vars.size(); j++) {
+             Var var = (Var)vars.get(j);
+ 
+             if (!var.hasValue() && !boundVars.contains(var)) {
+                 varCount++;
+             }
+         }
+ 
+         // A bit of a hack to make sure directType-, directSubClassOf- and
+         // directSubPropertyOf patterns get sorted to the back because these
+         // are potentially more expensive to evaluate.
+         if (pe instanceof DirectType ||
+             pe instanceof DirectSubClassOf ||
+             pe instanceof DirectSubPropertyOf)
+         {
+             varCount++;
+         }
+ 
+         if (rangeCount != -1) {
+             // rangeCount is known for this path expression.
+             if (rangeCount < minRangeCount) {
+                 // More specific path expression found
+                 minRangeCount = rangeCount;
+                 result = pe;
+             }
+         } else {
+             // rangeCount is NOT known.
+             if (varCount < minVars) {
+                 // More specific path expression found
+                 minVars = varCount;
+                 result = pe;
+             }
+         }
+     }
+ 
+     return result;
+ }
+ 
+ /**
+  * Estimates the #of results for a triple pattern.
+  * 
+  * @param pe
+  * @param rangeCounts
+  * 
+  * @return The estimated range count or <code>-1</code> if this is not a
+  *         {@link TriplePattern}.
+  */
+ private int getRangeCount(PathExpression pe, Hashtable<PathExpression,Integer> rangeCounts) {
+ 
+     Integer rangeCount = rangeCounts.get(pe);
+ 
+     if (rangeCount != null) {
+ 
+         return rangeCount;
+ 
+     }
+ 
+     if (pe instanceof TriplePattern) {
+ 
+         TriplePattern tp = (TriplePattern)pe;
+ 
+         rangeCount = rangeCount(tp);
+ 
+         System.err.println("rangeCount: " + rangeCount + " : " + pe);
+ 
+         rangeCounts.put(pe, rangeCount);
+ 
+         return rangeCount.intValue();
+ 
+     } else {
+ 
+         return -1;
+ 
+     }
+ 
+ }
+ 
+ private int rangeCount(TriplePattern tp) {
+ 
+     /*
+      * Extract "variables". If hasValue() is true, then the variable is
+      * actually bound.
+      */
+     Var svar = tp.getSubjectVar();
+     Var pvar = tp.getPredicateVar();
+     Var ovar = tp.getObjectVar();
+ 
+     /*
+      * Extract the binding for each variable, or null iff not bound.
+      */
+     Resource s = svar.hasValue() ? (Resource)svar.getValue() : null;
+ 
+     URI p = pvar.hasValue() ? (URI)pvar.getValue() : null;
+ 
+     Value o = ovar.hasValue() ? ovar.getValue() : null;
+ 
+     /*
+      * Convert other Value object types to our object types.
+      */
+     if (s != null)
+         s = (Resource) valueFactory.toNativeValue(s);
+ 
+     if (p != null)
+         p = (URI) valueFactory.toNativeValue(p);
+ 
+     if (o != null)
+         o = (Value) valueFactory.toNativeValue(o);
+ 
+     /*
+      * Convert our object types to internal identifiers.
+      */
+     long _s, _p, _o;
+ 
+     _s = (s == null ? NULL : tripleStore.getTermId(s));
+     _p = (p == null ? NULL : tripleStore.getTermId(p));
+     _o = (o == null ? NULL : tripleStore.getTermId(o));
+ 
+     /*
+      * If a value was specified and it is not in the terms index then the
+      * statement can not exist in the KB.
+      */
+     if (_s == NULL && s != null) {
+ 
+         return 0;
+ 
+     }
+ 
+     if (_p == NULL && p != null) {
+ 
+         return 0;
+ 
+     }
+ 
+     if (_o == NULL && o != null) {
+ 
+         return 0;
+ 
+     }
+ 
+     return tripleStore.rangeCount(_s, _p, _o);
+ 
+ }
+ 
  /**
+  * Adds all verifiable constraints (constraints for which every variable has
+  * been bound to a specific value) from <tt>conjunctiveConstraints</tt> to
+  * <tt>expressions</tt>.
+  * 
+  * @param conjunctiveConstraints A List of BooleanExpr objects.
+  * @param boundVars A Set of Var objects that have been bound.
+  * @param expressions The list to add the verifiable constraints to.
+  **/
+ private void _addVerifiableConstraints(
+     List conjunctiveConstraints, Set boundVars, List expressions)
+ {
+     Iterator iter = conjunctiveConstraints.iterator();
+ 
+     while (iter.hasNext()) {
+         BooleanExpr constraint = (BooleanExpr)iter.next();
+ 
+         Set constraintVars = new HashSet();
+         constraint.getVariables(constraintVars);
+ 
+         if (boundVars.containsAll(constraintVars)) {
+             // constraint can be verified
+             expressions.add(constraint);
+             iter.remove();
+         }
+     }
+ }
+ 
+ /**
   * Replace all {@link Value} objects stored in variables with the
   * corresponding {@link _Value} objects.
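
A hypothetical illustration of why the range counts matter for the join
order (the patterns are LUBM-flavored and the counts are invented):

    // Given these (invented) range counts in the data:
    //   (?x, rdf:type, ub:GraduateStudent)       -> rangeCount = 1000
    //   (?x, ub:takesCourse, <GraduateCourse0>)  -> rangeCount = 4
    // _getMostSpecificPathExpression() picks the takesCourse pattern first,
    // so the join probes roughly 4 subjects rather than scanning ~1000
    // rdf:type entries and filtering afterwards.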
***************
*** 654,659 ****
   * selection of this option is NOT restart safe (it is not saved
   * in the store).
-  * 
-  * @see #loadData(File, String, boolean)
   */
  public void initialize(Map configParams) throws SailInitializationException {
--- 1070,1073 ----
***************
*** 661,664 ****
--- 1075,1095 ----
  
      properties = PropertyUtil.flatCopy(PropertyUtil.convert(configParams));
  
+     String val;
+ 
+     val = properties.getProperty(Options.RDFS_CLOSURE);
+ 
+     if (val != null) {
+ 
+         rdfsClosure = Boolean.parseBoolean(val);
+ 
+     } else {
+ 
+         // No closure by default.
+         rdfsClosure = false;
+ 
+     }
+ 
+     System.err.println(Options.RDFS_CLOSURE + "=" + rdfsClosure);
+ 
      valueFactory = new OptimizedValueFactory();
***************
*** 681,713 ****
  }
  
! /**
!  * Uses a fast batch loader to load the data into the store.
!  * <p>
!  * This method lies outside of the SAIL and does not rely on the SAIL
!  * "transaction" mechanisms.
!  * 
!  * @param file
!  *            The file containing the data.
!  * @param baseURI
!  *            The base URI (optional).
!  * @param commit
!  *            When true, the store will be committed after the data load
!  *            (you can use false here if you will load lots of small files
!  *            in sequence).
!  */
! public void loadData(File file, String baseURI, boolean commit)
!         throws IOException {
! 
!     tripleStore.loadData(file, baseURI, commit);
! 
! }
  
  /**
!  * Computes the closure of the triple store for RDFS entailments. This
!  * implementation computes the full forward closure of the store and then
!  * commits the store. Since incremental closure and truth maintenance are
!  * NOT supported, you should load all your data into the store using
!  * {@link #loadData(File, String, boolean)} and then invoke this method to
!  * compute the RDFS entailments.
   * <p>
   * This method lies outside of the SAIL and does not rely on the SAIL
--- 1112,1149 ----
  }
  
! // /**
! //  * Uses a fast batch loader to load the data into the store.
! //  * <p>
! //  * This method lies outside of the SAIL and does not rely on the SAIL
! //  * "transaction" mechanisms.
! //  * <p>
! //  * This method does NOT perform RDFS closure - you must explicitly request
! //  * that yourself, e.g., by specifying <code>commit:=false</code> here and
! //  * then invoking {@link #fullForwardClosure()}.
! //  * 
! //  * @param file
! //  *            The file containing the data.
! //  * @param baseURI
! //  *            The base URI (optional).
! //  * @param commit
! //  *            When true, the store will be committed after the data load
! //  *            (you can use false here if you will load lots of small files
! //  *            in sequence).
! //  */
! // public void loadData(File file, String baseURI, boolean commit)
! //         throws IOException {
! // 
! //     tripleStore.loadData(file, baseURI, commit);
! // 
! // }
  
  /**
!  * Computes the closure of the triple store for RDFS entailments.
!  * <p>
!  * This computes the full forward closure of the store and then commits the
!  * store. Since incremental closure and truth maintenance are NOT supported,
!  * you should load all your data into the store using
!  * {@link TripleStore#loadData(File, String, org.openrdf.sesame.constants.RDFFormat, boolean, boolean)}
!  * and then invoke this method to compute the RDFS entailments.
   * <p>
   * This method lies outside of the SAIL and does not rely on the SAIL

--- NEW FILE: .cvsignore ---
SimpleRdfsRepository.java

--- NEW FILE: Options.java ---
/**
The Notice below must appear in each file of the Source Code of any copy you
distribute of the Licensed Product. Contributors to any Modifications may add
their own copyright notices to identify their own contributions.

License:

The contents of this file are subject to the CognitiveWeb Open Source License
Version 1.1 (the License).
You may not copy or use this file, in either source code or executable form,
except in compliance with the License. You may obtain a copy of the License
from

  http://www.CognitiveWeb.org/legal/license/

Software distributed under the License is distributed on an AS IS basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
the specific language governing rights and limitations under the License.

Copyrights:

Portions created by or assigned to CognitiveWeb are Copyright (c) 2003-2003
CognitiveWeb. All Rights Reserved. Contact information for CognitiveWeb is
available at

  http://www.CognitiveWeb.org

Portions Copyright (c) 2002-2003 Bryan Thompson.

Acknowledgements:

Special thanks to the developers of the Jabber Open Source License 1.0
(JOSL), from which this License was derived. This License contains terms
that differ from JOSL.

Special thanks to the CognitiveWeb Open Source Contributors for their
suggestions and support of the Cognitive Web.

Modifications:

*/
/*
 * Created on Apr 17, 2007
 */
package com.bigdata.rdf.sail;

import com.bigdata.rdf.TripleStore;

/**
 * Additional parameters understood by the Sesame 1.x SAIL implementation.
 * 
 * @author <a href="mailto:tho...@us...">Bryan Thompson</a>
 * @version $Id$
 */
public class Options extends com.bigdata.journal.Options {

    /**
     * This optional boolean property may be used to specify whether or not
     * RDFS entailments are maintained by eager closure of the knowledge base
     * (default false).
     */
    public static final String RDFS_CLOSURE = "rdfsClosure";

}
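
A minimal configuration sketch showing the new option in use. Since
java.util.Properties implements Map, it can serve directly as the
configParams argument to SimpleRdfRepository#initialize(Map):

    Properties params = new Properties();

    // Maintain the RDFS closure eagerly on each SAIL commit.
    params.setProperty(Options.RDFS_CLOSURE, "true");

    // Options extends com.bigdata.journal.Options, so journal-level
    // parameters (e.g., the backing file) may be set on the same object.
    SimpleRdfRepository repo = new SimpleRdfRepository();
    repo.initialize(params); // throws SailInitializationException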