From: <mrp...@us...> - 2011-01-25 16:01:22
Revision: 4164
          http://bigdata.svn.sourceforge.net/bigdata/?rev=4164&view=rev
Author:   mrpersonick
Date:     2011-01-25 16:01:15 +0000 (Tue, 25 Jan 2011)

Log Message:
-----------
bringing over new free text search features from HA branch

Modified Paths:
--------------
    branches/QUADS_QUERY_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/BigdataEvaluationStrategyImpl3.java
    branches/QUADS_QUERY_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/FreeTextSearchExpander.java
    branches/QUADS_QUERY_BRANCH/bigdata-sails/src/test/com/bigdata/rdf/sail/TestSearchQuery.java
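Before the diffs, a quick illustration of what the new feature looks like from the client side: a bd:search tail can now carry metadata tails (bd:relevance, bd:maxHits, bd:minRelevance) that the evaluation strategy folds into a single free text search predicate. The Java sketch below is illustrative only and not part of this commit: the class name SearchExample, the connection setup, and the literal values are made up, and the BD vocabulary class is assumed to be com.bigdata.rdf.store.BD, which the tests below use without showing the import.

import org.openrdf.model.vocabulary.RDFS;
import org.openrdf.query.QueryLanguage;
import org.openrdf.query.TupleQuery;
import org.openrdf.query.TupleQueryResult;
import org.openrdf.repository.RepositoryConnection;

import com.bigdata.rdf.store.BD;

public class SearchExample {

    /**
     * Issues a free text search using the magic predicates, binding the
     * relevance score and capping/filtering the hits. The connection is
     * assumed to be open against a bigdata-backed Sesame repository.
     */
    public static void runSearch(final RepositoryConnection cxn)
            throws Exception {

        final String query =
            "select ?s ?o ?score " +
            "where { " +
            "  ?s <" + RDFS.LABEL + "> ?o . " +
            "  ?o <" + BD.SEARCH + "> \"foo\" . " +         // the search itself
            "  ?o <" + BD.RELEVANCE + "> ?score . " +       // bind cosine relevance
            "  ?o <" + BD.MAX_HITS + "> \"50\" . " +        // cap the hit count
            "  ?o <" + BD.MIN_RELEVANCE + "> \"0.25\" . " + // drop weak matches
            "} order by desc(?score)";

        final TupleQuery tupleQuery =
            cxn.prepareTupleQuery(QueryLanguage.SPARQL, query);
        final TupleQueryResult result = tupleQuery.evaluate();
        try {
            while (result.hasNext()) {
                System.out.println(result.next());
            }
        } finally {
            result.close();
        }
    }
}

The query shape mirrors the new test cases added to TestSearchQuery at the end of this commit.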
Modified: branches/QUADS_QUERY_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/BigdataEvaluationStrategyImpl3.java
===================================================================
--- branches/QUADS_QUERY_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/BigdataEvaluationStrategyImpl3.java	2011-01-22 21:13:16 UTC (rev 4163)
+++ branches/QUADS_QUERY_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/BigdataEvaluationStrategyImpl3.java	2011-01-25 16:01:15 UTC (rev 4164)
@@ -7,8 +7,11 @@
 import java.util.Collection;
 import java.util.HashSet;
 import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
 import java.util.LinkedList;
 import java.util.List;
+import java.util.Map;
 import java.util.Properties;
 import java.util.Set;
 import java.util.concurrent.TimeUnit;
@@ -615,6 +618,68 @@
         final Collection<SOpGroup> groupsToPrune = new LinkedList<SOpGroup>();
 
         /*
+         * We need to prune Sesame filters that we cannot translate into native
+         * constraints (ones that require lexicon joins). We also need to
+         * prune search metadata tails.
+         */
+        final Collection<SOp> sopsToPrune = new LinkedList<SOp>();
+
+        /*
+         * deal with free text search tails first. need to match up search
+         * metadata tails with the searches themselves. ie:
+         *
+         * select *
+         * where {
+         *   ?s bd:search "foo" .
+         *   ?s bd:relevance ?score .
+         * }
+         */
+        // the statement patterns for metadata about the searches
+        final Map<Var, Set<StatementPattern>> searchMetadata =
+            new LinkedHashMap<Var, Set<StatementPattern>>();
+        // do a first pass to gather up the actual searches and take them out
+        // of the master list of statement patterns
+        for (SOp sop : sopTree) {
+            final QueryModelNode op = sop.getOperator();
+            if (op instanceof StatementPattern) {
+                final StatementPattern sp = (StatementPattern) op;
+                final Value s = sp.getSubjectVar().getValue();
+                final Value p = sp.getPredicateVar().getValue();
+                final Value o = sp.getObjectVar().getValue();
+                if (s == null && p != null && o != null &&
+                        BD.SEARCH.equals(p)) {
+                    searchMetadata.put(sp.getSubjectVar(),
+                            new LinkedHashSet<StatementPattern>());
+                }
+            }
+        }
+        // do a second pass to get the search metadata
+        for (SOp sop : sopTree) {
+            final QueryModelNode op = sop.getOperator();
+            if (op instanceof StatementPattern) {
+                final StatementPattern sp = (StatementPattern) op;
+                final Value s = sp.getSubjectVar().getValue();
+                final Value p = sp.getPredicateVar().getValue();
+                if (s == null && p != null &&
+                        (BD.RELEVANCE.equals(p) || BD.MAX_HITS.equals(p) ||
+                            BD.MIN_RELEVANCE.equals(p))) {
+                    final Var sVar = sp.getSubjectVar();
+                    Set<StatementPattern> metadata = searchMetadata.get(sVar);
+                    if (metadata != null) {
+                        metadata.add(sp);
+                    }
+                    sopsToPrune.add(sop);
+                }
+            }
+        }
+
+        /*
+         * Prunes the sop tree of search metadata.
+         */
+        sopTree = stb.pruneSOps(sopTree, sopsToPrune);
+        sopsToPrune.clear();
+
+        /*
          * Iterate through the sop tree and translate statement patterns into
         * predicates.
         */
@@ -622,8 +687,16 @@
             final QueryModelNode op = sop.getOperator();
             if (op instanceof StatementPattern) {
                 final StatementPattern sp = (StatementPattern) op;
+                final Value p = sp.getPredicateVar().getValue();
                 try {
-                    final IPredicate bop = toPredicate((StatementPattern) op);
+                    final IPredicate bop;
+                    if (p != null && BD.SEARCH.equals(p)) {
+                        final Set<StatementPattern> metadata =
+                            searchMetadata.get(sp.getSubjectVar());
+                        bop = toSearchPredicate(sp, metadata);
+                    } else {
+                        bop = toPredicate((StatementPattern) op);
+                    }
                     sop.setBOp(bop);
                 } catch (UnrecognizedValueException ex) {
                     /*
@@ -657,12 +730,6 @@
         final Collection<Filter> sesameFilters = new LinkedList<Filter>();
 
         /*
-         * We need to prune Sesame filters that we cannot translate into native
-         * constraints (ones that require lexicon joins).
-         */
-        final Collection<SOp> sopsToPrune = new LinkedList<SOp>();
-
-        /*
          * Iterate through the sop tree and translate Sesame ValueExpr operators
          * into bigdata IConstraint boperators.
         */
@@ -1842,6 +1909,118 @@
 
     }
 
+    private IPredicate toSearchPredicate(final StatementPattern sp,
+            final Set<StatementPattern> metadata)
+            throws QueryEvaluationException {
+
+        final Value predValue = sp.getPredicateVar().getValue();
+        if (log.isDebugEnabled()) {
+            log.debug(predValue);
+        }
+        if (predValue == null || !BD.SEARCH.equals(predValue)) {
+            throw new IllegalArgumentException("not a valid magic search: " + sp);
+        }
+        final Value objValue = sp.getObjectVar().getValue();
+        if (log.isDebugEnabled()) {
+            log.debug(objValue);
+        }
+        if (objValue == null || !(objValue instanceof Literal)) {
+            throw new IllegalArgumentException("not a valid magic search: " + sp);
+        }
+
+        final Var subjVar = sp.getSubjectVar();
+
+        final IVariableOrConstant<IV> search =
+            com.bigdata.bop.Var.var(subjVar.getName());
+
+        IVariableOrConstant<IV> relevance = new Constant(DummyIV.INSTANCE);
+        Literal maxHits = null;
+        Literal minRelevance = null;
+
+        for (StatementPattern meta : metadata) {
+            if (!meta.getSubjectVar().equals(subjVar)) {
+                throw new IllegalArgumentException("illegal metadata: " + meta);
+            }
+            final Value pVal = meta.getPredicateVar().getValue();
+            final Var oVar = meta.getObjectVar();
+            final Value oVal = oVar.getValue();
+            if (pVal == null) {
+                throw new IllegalArgumentException("illegal metadata: " + meta);
+            }
+            if (BD.RELEVANCE.equals(pVal)) {
+                if (oVar.hasValue()) {
+                    throw new IllegalArgumentException("illegal metadata: " + meta);
+                }
+                relevance = com.bigdata.bop.Var.var(oVar.getName());
+            } else if (BD.MAX_HITS.equals(pVal)) {
+                if (oVal == null || !(oVal instanceof Literal)) {
+                    throw new IllegalArgumentException("illegal metadata: " + meta);
+                }
+                maxHits = (Literal) oVal;
+            } else if (BD.MIN_RELEVANCE.equals(pVal)) {
+                if (oVal == null || !(oVal instanceof Literal)) {
+                    throw new IllegalArgumentException("illegal metadata: " + meta);
+                }
+                minRelevance = (Literal) oVal;
+            }
+        }
+
+        final IAccessPathExpander expander =
+            new FreeTextSearchExpander(database, (Literal) objValue,
+                    maxHits, minRelevance);
+
+        // Decide on the correct arity for the predicate.
+        final BOp[] vars = new BOp[] {
+                search, // s = searchVar
+                relevance, // p = relevanceVar
+                new Constant(DummyIV.INSTANCE), // o = reserved
+                new Constant(DummyIV.INSTANCE), // c = reserved
+        };
+
+        // The annotations for the predicate.
+        final List<NV> anns = new LinkedList<NV>();
+
+        anns.add(new NV(IPredicate.Annotations.RELATION_NAME,
+                new String[] { database.getSPORelation().getNamespace() }));//
+
+        // free text search expander or named graphs expander
+        if (expander != null)
+            anns.add(new NV(IPredicate.Annotations.ACCESS_PATH_EXPANDER, expander));
+
+        // timestamp
+        anns.add(new NV(Annotations.TIMESTAMP, database
+                .getSPORelation().getTimestamp()));
+
+        /*
+         * Explicitly set the access path / iterator flags.
+         *
+         * Note: High level query generally permits iterator level parallelism.
+         * We set the PARALLEL flag here so it can be used if a global index
+         * view is chosen for the access path.
+         *
+         * Note: High level query for SPARQL always uses read-only access paths.
+         * If you are working with a SPARQL extension with UPDATE or INSERT INTO
+         * semantics then you will need to remove the READONLY flag for the
+         * mutable access paths.
+         */
+        anns.add(new NV(IPredicate.Annotations.FLAGS, IRangeQuery.DEFAULT
+                | IRangeQuery.PARALLEL | IRangeQuery.READONLY));
+
+        return new SPOPredicate(vars, anns.toArray(new NV[anns.size()]));
+//        return new SPOPredicate(
+//                new String[] { database.getSPORelation().getNamespace() },
+//                -1, // partitionId
+//                search, // s = searchVar
+//                relevance, // p = relevanceVar
+//                new Constant(DummyIV.INSTANCE), // o = reserved
+//                new Constant(DummyIV.INSTANCE), // c = reserved
+//                false, // optional
+//                null, // filter on elements visited by the access path.
+//                expander // free text search expander or named graphs expander
+//                );
+
+    }
+
     /**
      * Takes a ValueExpression from a sesame Filter or LeftJoin and turns it
      * into a bigdata {@link IConstraint}.
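The heart of the evaluation strategy change is the two-pass matching above: the first pass registers each bd:search tail under its subject variable, and the second pass attaches any metadata tails sharing that subject variable, marking them for pruning so they never reach the join plan. Distilled into a standalone sketch over the Sesame algebra types (SearchTailMatcher and its parameters are hypothetical helpers, not code from this commit):

import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.openrdf.model.Value;
import org.openrdf.query.algebra.StatementPattern;
import org.openrdf.query.algebra.Var;

public class SearchTailMatcher {

    /**
     * Pass one finds the search tails (unbound subject, the given search
     * predicate, bound object) and registers their subject vars; pass two
     * attaches metadata tails (unbound subject, one of the metadata
     * predicates) to the matching search.
     */
    public static Map<Var, Set<StatementPattern>> match(
            final List<StatementPattern> tails,
            final Value searchPredicate,
            final Set<Value> metadataPredicates) {

        final Map<Var, Set<StatementPattern>> searchMetadata =
                new LinkedHashMap<Var, Set<StatementPattern>>();

        // pass one: gather up the actual searches
        for (StatementPattern sp : tails) {
            final Value p = sp.getPredicateVar().getValue();
            if (sp.getSubjectVar().getValue() == null
                    && searchPredicate.equals(p)
                    && sp.getObjectVar().getValue() != null) {
                searchMetadata.put(sp.getSubjectVar(),
                        new LinkedHashSet<StatementPattern>());
            }
        }

        // pass two: attach metadata tails to the search they describe
        for (StatementPattern sp : tails) {
            final Value p = sp.getPredicateVar().getValue();
            if (sp.getSubjectVar().getValue() == null
                    && p != null && metadataPredicates.contains(p)) {
                final Set<StatementPattern> metadata =
                        searchMetadata.get(sp.getSubjectVar());
                if (metadata != null) {
                    metadata.add(sp);
                }
            }
        }
        return searchMetadata;
    }
}

In the real code the second pass also collects the matched tails into sopsToPrune so they can be removed from the sop tree in one call to stb.pruneSOps().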
Modified: branches/QUADS_QUERY_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/FreeTextSearchExpander.java
===================================================================
--- branches/QUADS_QUERY_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/FreeTextSearchExpander.java	2011-01-22 21:13:16 UTC (rev 4163)
+++ branches/QUADS_QUERY_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/FreeTextSearchExpander.java	2011-01-25 16:01:15 UTC (rev 4164)
@@ -16,6 +16,7 @@
 import com.bigdata.rdf.internal.TermId;
 import com.bigdata.rdf.internal.VTE;
 import com.bigdata.rdf.internal.XSDDoubleIV;
+import com.bigdata.rdf.lexicon.ITextIndexer;
 import com.bigdata.rdf.model.BigdataValue;
 import com.bigdata.rdf.spo.ISPO;
 import com.bigdata.rdf.spo.SPO;
@@ -138,14 +139,28 @@
             if (hiterator == null) {
                 assert database!=null;
                 assert query != null;
-                if (database.getLexiconRelation().getSearchEngine() == null)
+
+                final ITextIndexer textNdx =
+                    database.getLexiconRelation().getSearchEngine();
+
+                if (textNdx == null)
                     throw new UnsupportedOperationException(
                             "No free text index?");
+
 //                final long begin = System.nanoTime();
-                hiterator = database.getLexiconRelation()
-                        .getSearchEngine().search(query.getLabel(),
+
+                String s = query.getLabel();
+                final boolean prefixMatch;
+                if (s.indexOf('*') >= 0) {
+                    prefixMatch = true;
+                    s = s.replaceAll("\\*", "");
+                } else {
+                    prefixMatch = false;
+                }
+
+                hiterator = textNdx.search(s,
                         query.getLanguage(),
-                        false/* prefixMatch */,
+                        prefixMatch,
                         minRelevance == null ? 0d : minRelevance.doubleValue()/* minCosine */,
                         maxHits == null ? 10000 : maxHits.intValue()+1/* maxRank */,
                         1000L/* timeout */, TimeUnit.MILLISECONDS);
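The expander change above is what makes prefix matching reachable from SPARQL: a '*' anywhere in the search literal flips prefixMatch to true, and the '*' characters are stripped before the text index is consulted, so "bro*" matches "brown". A minimal standalone illustration of that rule (PrefixMatchSniffer is a hypothetical name; the real logic lives in FreeTextSearchExpander as shown above):

public class PrefixMatchSniffer {

    /** Holds the cleaned query string and the derived prefixMatch flag. */
    public static final class Parsed {
        public final String query;
        public final boolean prefixMatch;
        Parsed(final String query, final boolean prefixMatch) {
            this.query = query;
            this.prefixMatch = prefixMatch;
        }
    }

    /** Detect a wildcard, strip it, and report whether to prefix match. */
    public static Parsed parse(final String label) {
        if (label.indexOf('*') >= 0) {
            // e.g. "bro*" -> query "bro", prefixMatch = true
            return new Parsed(label.replaceAll("\\*", ""), true);
        }
        return new Parsed(label, false);
    }

    public static void main(final String[] args) {
        final Parsed p = parse("bro*");
        System.out.println(p.query + " / prefixMatch=" + p.prefixMatch);
    }
}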
" + +// " filter regex(?o, \""+searchQuery+"\") " + + "} " + + "order by desc(?score)"; + + log.info("\n"+query); + + final TupleQuery tupleQuery = + cxn.prepareTupleQuery(QueryLanguage.SPARQL, query); + tupleQuery.setIncludeInferred(true /* includeInferred */); + TupleQueryResult result = tupleQuery.evaluate(); + + int i = 0; + while (result.hasNext()) { + log.info(i++ + ": " + result.next().toString()); + } + assertTrue("wrong # of results: " + i, i == 3); + + result = tupleQuery.evaluate(); + + Collection<BindingSet> answer = new LinkedList<BindingSet>(); + + final ITextIndexer search = + sail.getDatabase().getLexiconRelation().getSearchEngine(); + final Hiterator<IHit> hits = + search.search(searchQuery, + null, // languageCode + true, // prefixMatch + minRelevance, // minCosine + 10000, // maxRank (=maxResults + 1) + 1000L, // timeout + TimeUnit.MILLISECONDS // unit + ); + + while (hits.hasNext()) { + final IHit hit = hits.next(); + final IV id = new TermId(VTE.LITERAL, hit.getDocId()); + final Literal score = vf.createLiteral(hit.getCosine()); + final URI s = uris.get(id); + final Literal o = literals.get(id); + final BindingSet bs = createBindingSet( + new BindingImpl("s", s), + new BindingImpl("o", o), + new BindingImpl("score", score)); + log.info(bs); + answer.add(bs); + } + + compare(result, answer); + + } + } finally { cxn.close(); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |