From: <mrp...@us...> - 2010-12-21 15:09:14
|
Revision: 4028 http://bigdata.svn.sourceforge.net/bigdata/?rev=4028&view=rev Author: mrpersonick Date: 2010-12-21 15:09:07 +0000 (Tue, 21 Dec 2010) Log Message: ----------- expanding free text search functionality to include search metadata support via SPARQL Modified Paths: -------------- branches/JOURNAL_HA_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/store/BD.java branches/JOURNAL_HA_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/BigdataEvaluationStrategyImpl2.java branches/JOURNAL_HA_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/FreeTextSearchExpander.java branches/JOURNAL_HA_BRANCH/bigdata-sails/src/test/com/bigdata/rdf/sail/TestSearchQuery.java Modified: branches/JOURNAL_HA_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/store/BD.java =================================================================== --- branches/JOURNAL_HA_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/store/BD.java 2010-12-21 15:05:59 UTC (rev 4027) +++ branches/JOURNAL_HA_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/store/BD.java 2010-12-21 15:09:07 UTC (rev 4028) @@ -63,8 +63,10 @@ /** * The namespace used for bigdata specific extensions. */ - String NAMESPACE = "http://www.bigdata.com/rdf#"; + final String NAMESPACE = "http://www.bigdata.com/rdf#"; + final String SEARCH_NAMESPACE = "http://www.bigdata.com/rdf/search#"; + /** * The namespace prefix used in SPARQL queries to signify query hints. You * can embed query hints into a SPARQL query as follows: @@ -150,7 +152,13 @@ * Note: The context position should be unbound when using statement * identifiers. */ - URI SEARCH = new URIImpl(NAMESPACE+"search"); + final URI SEARCH = new URIImpl(SEARCH_NAMESPACE+"search"); + + final URI RELEVANCE = new URIImpl(SEARCH_NAMESPACE+"relevance"); + + final URI RANK = new URIImpl(SEARCH_NAMESPACE+"rank"); + + final URI NUM_MATCHED_TOKENS = new URIImpl(SEARCH_NAMESPACE+"numMatchedTokens"); /** * Sesame has the notion of a "null" graph. Any time you insert a statement Modified: branches/JOURNAL_HA_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/BigdataEvaluationStrategyImpl2.java =================================================================== --- branches/JOURNAL_HA_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/BigdataEvaluationStrategyImpl2.java 2010-12-21 15:05:59 UTC (rev 4027) +++ branches/JOURNAL_HA_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/BigdataEvaluationStrategyImpl2.java 2010-12-21 15:09:07 UTC (rev 4028) @@ -8,6 +8,7 @@ import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; +import java.util.LinkedHashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; @@ -751,11 +752,50 @@ // problem final Map<IPredicate, StatementPattern> searches = new HashMap<IPredicate, StatementPattern>(); + + final Set<StatementPattern> searchMetadata1 = + new LinkedHashSet<StatementPattern>(); + final Map<Var, Set<StatementPattern>> searchMetadata2 = + new LinkedHashMap<Var, Set<StatementPattern>>(); + Iterator<Map.Entry<StatementPattern, Boolean>> it = + stmtPatterns.entrySet().iterator(); + while (it.hasNext()) { + final StatementPattern sp = it.next().getKey(); + final Value s = sp.getSubjectVar().getValue(); + final Value p = sp.getPredicateVar().getValue(); + final Value o = sp.getObjectVar().getValue(); + if (s == null && p != null && o != null) { + if (BD.SEARCH.equals(p)) { + searchMetadata1.add(sp); + searchMetadata2.put(sp.getSubjectVar(), + new LinkedHashSet<StatementPattern>()); + it.remove(); + } + } + } + it = stmtPatterns.entrySet().iterator(); + while (it.hasNext()) { + final StatementPattern sp = it.next().getKey(); + final Value s = sp.getSubjectVar().getValue(); + final Value p = sp.getPredicateVar().getValue(); + final Value o = sp.getObjectVar().getValue(); + if (s == null && p != null && o == null) { + if (BD.RELEVANCE.equals(p)) { + final Var sVar = sp.getSubjectVar(); + Set<StatementPattern> metadata = searchMetadata2.get(sVar); + if (metadata != null) { + metadata.add(sp); + } + it.remove(); + } + } + } + for (Map.Entry<StatementPattern, Boolean> entry : stmtPatterns .entrySet()) { - StatementPattern sp = entry.getKey(); - boolean optional = entry.getValue(); - IPredicate tail = generateTail(sp, optional); + final StatementPattern sp = entry.getKey(); + final boolean optional = entry.getValue(); + final IPredicate tail = generateTail(sp, optional); // encountered a value not in the database lexicon if (tail == null) { if (log.isDebugEnabled()) { @@ -769,12 +809,17 @@ return null; } } - if (tail.getSolutionExpander() instanceof FreeTextSearchExpander) { - searches.put(tail, sp); - } tails.add(tail); } + for (StatementPattern sp : searchMetadata1) { + final Set<StatementPattern> metadata = + searchMetadata2.get(sp.getSubjectVar()); + final IPredicate tail = generateSearchTail(sp, metadata); + searches.put(tail, sp); + tails.add(tail); + } + /* * When in quads mode, we need to go through the free text searches and * make sure that they are properly filtered for the dataset where @@ -826,7 +871,7 @@ boolean needsFilter = true; // check the other tails one by one for (IPredicate<ISPO> tail : tails) { - ISolutionExpander<ISPO> expander = + final ISolutionExpander<ISPO> expander = tail.getSolutionExpander(); // only concerned with non-optional tails that are not // themselves magic searches @@ -837,7 +882,7 @@ // see if the search variable appears in this tail boolean appears = false; for (int i = 0; i < tail.arity(); i++) { - IVariableOrConstant term = tail.get(i); + final IVariableOrConstant term = tail.get(i); if (log.isDebugEnabled()) { log.debug(term); } @@ -857,8 +902,8 @@ if (log.isDebugEnabled()) { log.debug("needs filter: " + searchVar); } - FreeTextSearchExpander expander = (FreeTextSearchExpander) - search.getSolutionExpander(); + final FreeTextSearchExpander expander = + (FreeTextSearchExpander) search.getSolutionExpander(); expander.addNamedGraphsFilter(graphs); } } @@ -907,9 +952,9 @@ IAccessPath<ISPO> accessPath = database.getSPORelation() .getAccessPath(tail); accessPath = expander.getAccessPath(accessPath); - IChunkedOrderedIterator<ISPO> it = accessPath.iterator(); - while (it.hasNext()) { - log.debug(it.next().toString(database)); + IChunkedOrderedIterator<ISPO> it1 = accessPath.iterator(); + while (it1.hasNext()) { + log.debug(it1.next().toString(database)); } } } @@ -1257,23 +1302,6 @@ private IPredicate generateTail(final StatementPattern stmtPattern, final boolean optional) throws QueryEvaluationException { - // create a solution expander for free text search if necessary - ISolutionExpander<ISPO> expander = null; - final Value predValue = stmtPattern.getPredicateVar().getValue(); - if (log.isDebugEnabled()) { - log.debug(predValue); - } - if (predValue != null && BD.SEARCH.equals(predValue)) { - final Value objValue = stmtPattern.getObjectVar().getValue(); - if (log.isDebugEnabled()) { - log.debug(objValue); - } - if (objValue != null && objValue instanceof Literal) { - expander = new FreeTextSearchExpander(database, - (Literal) objValue); - } - } - // @todo why is [s] handled differently? // because [s] is the variable in free text searches, no need to test // to see if the free text search expander is in place @@ -1283,26 +1311,20 @@ return null; } - final IVariableOrConstant<IV> p; - if (expander == null) { - p = generateVariableOrConstant(stmtPattern.getPredicateVar()); - } else { - p = new Constant(DummyIV.INSTANCE); - } + final IVariableOrConstant<IV> p = generateVariableOrConstant( + stmtPattern.getPredicateVar()); if (p == null) { return null; } - final IVariableOrConstant<IV> o; - if (expander == null) { - o = generateVariableOrConstant(stmtPattern.getObjectVar()); - } else { - o = new Constant(DummyIV.INSTANCE); - } + final IVariableOrConstant<IV> o = generateVariableOrConstant( + stmtPattern.getObjectVar()); if (o == null) { return null; } - + + // for default and named graph expansion + ISolutionExpander<ISPO> expander = null; final IVariableOrConstant<IV> c; if (!database.isQuads()) { /* @@ -1361,79 +1383,68 @@ } System.err.println(stmtPattern.toString()); } - if (expander != null) { - /* - * @todo can this happen? If it does then we need to look at how - * to layer the expanders. - */ - // throw new AssertionError("expander already set"); - // we are doing a free text search, no need to do any named or - // default graph expansion work - c = null; - } else { - final Var cvar = stmtPattern.getContextVar(); - if (dataset == null) { - if (cvar == null) { - /* - * There is no dataset and there is no graph variable, - * so the default graph will be the RDF Merge of ALL - * graphs in the quad store. - * - * This code path uses an "expander" which strips off - * the context information and filters for the distinct - * (s,p,o) triples to realize the RDF Merge of the - * source graphs for the default graph. - */ + final Var cvar = stmtPattern.getContextVar(); + if (dataset == null) { + if (cvar == null) { + /* + * There is no dataset and there is no graph variable, + * so the default graph will be the RDF Merge of ALL + * graphs in the quad store. + * + * This code path uses an "expander" which strips off + * the context information and filters for the distinct + * (s,p,o) triples to realize the RDF Merge of the + * source graphs for the default graph. + */ + c = null; + expander = new DefaultGraphSolutionExpander(null/* ALL */); + } else { + /* + * There is no data set and there is a graph variable, + * so the query will run against all named graphs and + * [cvar] will be to the context of each (s,p,o,c) in + * turn. This handles constructions such as: + * + * "SELECT * WHERE {graph ?g {?g :p :o } }" + */ + expander = new NamedGraphSolutionExpander(null/* ALL */); + c = generateVariableOrConstant(cvar); + } + } else { // dataset != null + switch (stmtPattern.getScope()) { + case DEFAULT_CONTEXTS: { + /* + * Query against the RDF merge of zero or more source + * graphs. + */ + expander = new DefaultGraphSolutionExpander(dataset + .getDefaultGraphs()); + /* + * Note: cvar can not become bound since context is + * stripped for the default graph. + */ + if (cvar == null) c = null; - expander = new DefaultGraphSolutionExpander(null/* ALL */); + else + c = generateVariableOrConstant(cvar); + break; + } + case NAMED_CONTEXTS: { + /* + * Query against zero or more named graphs. + */ + expander = new NamedGraphSolutionExpander(dataset + .getNamedGraphs()); + if (cvar == null) {// || !cvar.hasValue()) { + c = null; } else { - /* - * There is no data set and there is a graph variable, - * so the query will run against all named graphs and - * [cvar] will be to the context of each (s,p,o,c) in - * turn. This handles constructions such as: - * - * "SELECT * WHERE {graph ?g {?g :p :o } }" - */ - expander = new NamedGraphSolutionExpander(null/* ALL */); c = generateVariableOrConstant(cvar); } - } else { // dataset != null - switch (stmtPattern.getScope()) { - case DEFAULT_CONTEXTS: { - /* - * Query against the RDF merge of zero or more source - * graphs. - */ - expander = new DefaultGraphSolutionExpander(dataset - .getDefaultGraphs()); - /* - * Note: cvar can not become bound since context is - * stripped for the default graph. - */ - if (cvar == null) - c = null; - else - c = generateVariableOrConstant(cvar); - break; - } - case NAMED_CONTEXTS: { - /* - * Query against zero or more named graphs. - */ - expander = new NamedGraphSolutionExpander(dataset - .getNamedGraphs()); - if (cvar == null) {// || !cvar.hasValue()) { - c = null; - } else { - c = generateVariableOrConstant(cvar); - } - break; - } - default: - throw new AssertionError(); - } + break; } + default: + throw new AssertionError(); + } } } @@ -1456,6 +1467,63 @@ s, p, o, c, optional, // optional filter, // filter on elements visited by the access path. + expander // named graphs expander + ); + + } + + private IPredicate generateSearchTail(final StatementPattern sp, + final Set<StatementPattern> metadata) + throws QueryEvaluationException { + + final Value predValue = sp.getPredicateVar().getValue(); + if (log.isDebugEnabled()) { + log.debug(predValue); + } + if (predValue == null || !BD.SEARCH.equals(predValue)) { + throw new IllegalArgumentException("not a valid magic search: " + sp); + } + final Value objValue = sp.getObjectVar().getValue(); + if (log.isDebugEnabled()) { + log.debug(objValue); + } + if (objValue == null || !(objValue instanceof Literal)) { + throw new IllegalArgumentException("not a valid magic search: " + sp); + } + + final ISolutionExpander expander = + new FreeTextSearchExpander(database, (Literal) objValue); + + final Var subjVar = sp.getSubjectVar(); + + final IVariableOrConstant<IV> search = + com.bigdata.relation.rule.Var.var(subjVar.getName()); + + IVariableOrConstant<IV> relevance = new Constant(DummyIV.INSTANCE); + + for (StatementPattern meta : metadata) { + if (!meta.getSubjectVar().equals(subjVar)) { + throw new IllegalArgumentException("illegal metadata: " + meta); + } + final Value pVal = meta.getPredicateVar().getValue(); + final Var oVar = meta.getObjectVar(); + if (pVal == null || oVar.hasValue()) { + throw new IllegalArgumentException("illegal metadata: " + meta); + } + if (BD.RELEVANCE.equals(pVal)) { + relevance = com.bigdata.relation.rule.Var.var(oVar.getName()); + } + } + + return new SPOPredicate( + new String[] { database.getSPORelation().getNamespace() }, + -1, // partitionId + search, // s = searchVar + relevance, // p = relevanceVar + new Constant(DummyIV.INSTANCE), // o = reserved + new Constant(DummyIV.INSTANCE), // c = reserved + false, // optional + null, // filter on elements visited by the access path. expander // free text search expander or named graphs expander ); @@ -1707,6 +1775,12 @@ /** * Override evaluation of StatementPatterns to recognize magic search * predicate. + * + * select * + * where { + * ?s bd:search "foo" . + * ?s bd:score ?score . + * } */ @Override public CloseableIteration<BindingSet, QueryEvaluationException> evaluate( Modified: branches/JOURNAL_HA_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/FreeTextSearchExpander.java =================================================================== --- branches/JOURNAL_HA_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/FreeTextSearchExpander.java 2010-12-21 15:05:59 UTC (rev 4027) +++ branches/JOURNAL_HA_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/FreeTextSearchExpander.java 2010-12-21 15:09:07 UTC (rev 4028) @@ -13,6 +13,7 @@ import com.bigdata.rdf.internal.IV; import com.bigdata.rdf.internal.TermId; import com.bigdata.rdf.internal.VTE; +import com.bigdata.rdf.internal.XSDDoubleIV; import com.bigdata.rdf.model.BigdataValue; import com.bigdata.rdf.spo.ISPO; import com.bigdata.rdf.spo.SPO; @@ -305,9 +306,12 @@ } ISPO[] spos = new ISPO[hits.length]; for (int i = 0; i < hits.length; i++) { - IV s = new TermId(VTE.LITERAL, hits[i].getDocId()); - if (INFO) log.info("hit: " + s); - spos[i] = new SPO(s, null, null); + final IV s = new TermId(VTE.LITERAL, hits[i].getDocId()); + final IV p = new XSDDoubleIV(hits[i].getCosine()); + final IV o = null; // reserved + final IV c = null; // reserved + spos[i] = new SPO(s, p, o, c); + if (INFO) log.info("hit: " + spos[i]); } // Arrays.sort(spos, SPOKeyOrder.SPO.getComparator()); return spos; @@ -316,9 +320,12 @@ private ISPO[] convertWhenBound(IHit[] hits) { ISPO[] result = new ISPO[0]; for (IHit hit : hits) { - IV s = new TermId(VTE.LITERAL, hit.getDocId()); + final IV s = new TermId(VTE.LITERAL, hit.getDocId()); if (s == boundVal) { - result = new ISPO[] { new SPO(s, null, null) }; + final IV p = new XSDDoubleIV(hit.getCosine()); + final IV o = null; // reserved + final IV c = null; // reserved + result = new ISPO[] { new SPO(s, p, o, c) }; break; } } Modified: branches/JOURNAL_HA_BRANCH/bigdata-sails/src/test/com/bigdata/rdf/sail/TestSearchQuery.java =================================================================== --- branches/JOURNAL_HA_BRANCH/bigdata-sails/src/test/com/bigdata/rdf/sail/TestSearchQuery.java 2010-12-21 15:05:59 UTC (rev 4027) +++ branches/JOURNAL_HA_BRANCH/bigdata-sails/src/test/com/bigdata/rdf/sail/TestSearchQuery.java 2010-12-21 15:09:07 UTC (rev 4028) @@ -40,10 +40,10 @@ import org.openrdf.model.BNode; import org.openrdf.model.Graph; import org.openrdf.model.Literal; -import org.openrdf.model.Resource; import org.openrdf.model.Statement; import org.openrdf.model.URI; import org.openrdf.model.Value; +import org.openrdf.model.ValueFactory; import org.openrdf.model.impl.BNodeImpl; import org.openrdf.model.impl.GraphImpl; import org.openrdf.model.impl.LiteralImpl; @@ -665,4 +665,89 @@ } + public void testWithRelevance() throws Exception { + + final BigdataSail sail = getSail(); + try { + + sail.initialize(); + final BigdataSailRepository repo = new BigdataSailRepository(sail); + final BigdataSailRepositoryConnection cxn = + (BigdataSailRepositoryConnection) repo.getConnection(); + cxn.setAutoCommit(false); + + try { + + final ValueFactory vf = sail.getValueFactory(); + + final URI s1 = vf.createURI(BD.NAMESPACE+"s1"); + final URI s2 = vf.createURI(BD.NAMESPACE+"s2"); + final URI s3 = vf.createURI(BD.NAMESPACE+"s3"); + final URI s4 = vf.createURI(BD.NAMESPACE+"s4"); + final URI s5 = vf.createURI(BD.NAMESPACE+"s5"); + final URI s6 = vf.createURI(BD.NAMESPACE+"s6"); + final URI s7 = vf.createURI(BD.NAMESPACE+"s7"); + final Literal l1 = vf.createLiteral("how"); + final Literal l2 = vf.createLiteral("now"); + final Literal l3 = vf.createLiteral("brown"); + final Literal l4 = vf.createLiteral("cow"); + final Literal l5 = vf.createLiteral("how now"); + final Literal l6 = vf.createLiteral("brown cow"); + final Literal l7 = vf.createLiteral("how now brown cow"); + + cxn.add(s1, RDFS.LABEL, l1); + cxn.add(s2, RDFS.LABEL, l2); + cxn.add(s3, RDFS.LABEL, l3); + cxn.add(s4, RDFS.LABEL, l4); + cxn.add(s5, RDFS.LABEL, l5); + cxn.add(s6, RDFS.LABEL, l6); + cxn.add(s7, RDFS.LABEL, l7); + + /* + * Note: The either flush() or commit() is required to flush the + * statement buffers to the database before executing any operations + * that go around the sail. + */ + cxn.commit(); + +/**/ + if (log.isInfoEnabled()) { + log.info("\n" + sail.getDatabase().dumpStore()); + } + + { // run the query with no graphs specified + final String query = + "select ?s ?o ?score " + + "where " + + "{ " + + " ?s <"+RDFS.LABEL+"> ?o . " + + " ?o <"+BD.SEARCH+"> \"how now brown cow\" . " + + " ?o <"+BD.RELEVANCE+"> ?score . " + + "}"; + + final TupleQuery tupleQuery = + cxn.prepareTupleQuery(QueryLanguage.SPARQL, query); + tupleQuery.setIncludeInferred(true /* includeInferred */); + TupleQueryResult result = tupleQuery.evaluate(); + + while (result.hasNext()) { + System.err.println(result.next()); + } + + result = tupleQuery.evaluate(); +// Collection<BindingSet> answer = new LinkedList<BindingSet>(); +// answer.add(createBindingSet(new BindingImpl("s", alice))); +// +// compare(result, answer); + } + + } finally { + cxn.close(); + } + } finally { + sail.__tearDownUnitTest(); + } + + } + } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |