From: <mrp...@us...> - 2011-05-03 16:42:29
|
Revision: 4441 http://bigdata.svn.sourceforge.net/bigdata/?rev=4441&view=rev Author: mrpersonick Date: 2011-05-03 16:42:23 +0000 (Tue, 03 May 2011) Log Message: ----------- added a maxRelevance option to free text search Modified Paths: -------------- branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/search/FullTextIndex.java branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/ITextIndexer.java branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/store/BD.java branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/rdf/lexicon/TestFullTextIndex.java branches/QUADS_QUERY_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/BigdataEvaluationStrategyImpl3.java branches/QUADS_QUERY_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/FreeTextSearchExpander.java branches/QUADS_QUERY_BRANCH/bigdata-sails/src/test/com/bigdata/rdf/sail/TestSearchQuery.java Modified: branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/search/FullTextIndex.java =================================================================== --- branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/search/FullTextIndex.java 2011-05-03 14:48:13 UTC (rev 4440) +++ branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/search/FullTextIndex.java 2011-05-03 16:42:23 UTC (rev 4441) @@ -910,6 +910,7 @@ languageCode,// prefixMatch,// .4, // minCosine + 1.0d, // maxCosine 10000, // maxRank false, // matchAllTerms this.timeout,// @@ -940,8 +941,8 @@ public Hiterator search(final String query, final String languageCode, final double minCosine, final int maxRank) { - return search(query, languageCode, false/* prefixMatch */, minCosine, - maxRank, false, this.timeout, TimeUnit.MILLISECONDS); + return search(query, languageCode, false/* prefixMatch */, minCosine, + 1.0d, maxRank, false, this.timeout, TimeUnit.MILLISECONDS); } @@ -979,6 +980,8 @@ * ). * @param minCosine * The minimum cosine that will be returned. + * @param maxCosine + * The maximum cosine that will be returned. * @param maxRank * The upper bound on the #of hits in the result set. * @param prefixMatch @@ -1014,7 +1017,8 @@ * terms are visited only when they occur in the matching field(s). */ public Hiterator<Hit> search(final String query, final String languageCode, - final boolean prefixMatch, final double minCosine, + final boolean prefixMatch, + final double minCosine, final double maxCosine, final int maxRank, final boolean matchAllTerms, long timeout, final TimeUnit unit) { @@ -1029,6 +1033,9 @@ if (minCosine < 0d || minCosine > 1d) throw new IllegalArgumentException(); + if (maxCosine < 0d || maxCosine > 1d) + throw new IllegalArgumentException(); + if (maxRank <= 0) throw new IllegalArgumentException(); @@ -1130,7 +1137,12 @@ } + /* + * If match all is specified, remove any hits with a term count less + * than the number of search tokens. + */ if (matchAllTerms) { + final int nterms = qdata.terms.size(); if (log.isInfoEnabled()) @@ -1144,6 +1156,7 @@ if (hit.getTermCount() != nterms) it.remove(); } + } // #of hits. @@ -1170,10 +1183,40 @@ if (log.isInfoEnabled()) log.info("Rank ordering "+nhits+" hits by relevance"); - final Hit[] a = hits.values().toArray(new Hit[nhits]); + Hit[] a = hits.values().toArray(new Hit[nhits]); Arrays.sort(a); + /* + * If maxCosine is specified, prune the hits that are above the max + */ + if (maxCosine < 1.0d) { + + // find the first occurrence of a hit that is <= maxCosine + int i = 0; + for (Hit h : a) { + if (h.getCosine() <= maxCosine) + break; + i++; + } + + // no hits with relevance less than maxCosine + if (i == a.length) { + + a = new Hit[0]; + + } else { + + // copy the hits from that first occurrence to the end + final Hit[] tmp = new Hit[a.length - i]; + System.arraycopy(a, i, tmp, 0, tmp.length); + + a = tmp; + + } + + } + final long elapsed = System.currentTimeMillis() - begin; if (log.isInfoEnabled()) Modified: branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/ITextIndexer.java =================================================================== --- branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/ITextIndexer.java 2011-05-03 14:48:13 UTC (rev 4440) +++ branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/ITextIndexer.java 2011-05-03 16:42:23 UTC (rev 4441) @@ -98,6 +98,9 @@ * <code>false</code>, only exact matches will be made. * @param minCosine * The minimum cosine that will be returned. + * @param maxCosine + * The maximum cosine that will be returned. Useful for + * evaluating in relevance ranges. * @param maxRank * The upper bound on the #of hits in the result set. * @param matchAllTerms @@ -111,7 +114,8 @@ * @return The result set. */ public Hiterator<A> search(final String query, final String languageCode, - final boolean prefixMatch, final double minCosine, + final boolean prefixMatch, + final double minCosine, final double maxCosine, final int maxRank, final boolean matchAllTerms, long timeout, final TimeUnit unit); Modified: branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/store/BD.java =================================================================== --- branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/store/BD.java 2011-05-03 14:48:13 UTC (rev 4440) +++ branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/store/BD.java 2011-05-03 16:42:23 UTC (rev 4441) @@ -193,6 +193,22 @@ * select ?s * where { * ?s bd:search "scale-out RDF triplestore" . + * ?s bd:maxRelevance "0.9"^^xsd:double . + * } + * + * </pre> + */ + final URI MAX_RELEVANCE = new URIImpl(SEARCH_NAMESPACE+"maxRelevance"); + + /** + * Magic predicate used to query for free text search metadata. Use + * in conjunction with {@link #SEARCH} as follows: + * <p> + * <pre> + * + * select ?s + * where { + * ?s bd:search "scale-out RDF triplestore" . * ?s bd:matchAllTerms "true" . * } * Modified: branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/rdf/lexicon/TestFullTextIndex.java =================================================================== --- branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/rdf/lexicon/TestFullTextIndex.java 2011-05-03 14:48:13 UTC (rev 4440) +++ branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/rdf/lexicon/TestFullTextIndex.java 2011-05-03 16:42:23 UTC (rev 4441) @@ -132,7 +132,8 @@ final float minCosine, final BigdataValue[] expected) { final Hiterator hitr = store.getLexiconRelation().getSearchEngine() - .search(query, languageCode, false/* prefixMatch */, minCosine, + .search(query, languageCode, false/* prefixMatch */, + minCosine, 1.0d/* maxCosine */, Integer.MAX_VALUE/* maxRank */, false/* matchAllTerms */, 2L/* timeout */, Modified: branches/QUADS_QUERY_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/BigdataEvaluationStrategyImpl3.java =================================================================== --- branches/QUADS_QUERY_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/BigdataEvaluationStrategyImpl3.java 2011-05-03 14:48:13 UTC (rev 4440) +++ branches/QUADS_QUERY_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/BigdataEvaluationStrategyImpl3.java 2011-05-03 16:42:23 UTC (rev 4441) @@ -666,6 +666,7 @@ if (s == null && p != null && (BD.RELEVANCE.equals(p) || BD.MAX_HITS.equals(p) || BD.MIN_RELEVANCE.equals(p) || + BD.MAX_RELEVANCE.equals(p) || BD.MATCH_ALL_TERMS.equals(p))) { final Var sVar = sp.getSubjectVar(); Set<StatementPattern> metadata = searchMetadata.get(sVar); @@ -1653,6 +1654,7 @@ IVariableOrConstant<IV> relevance = new Constant(DummyIV.INSTANCE); Literal maxHits = null; Literal minRelevance = null; + Literal maxRelevance = null; boolean matchAllTerms = false; for (StatementPattern meta : metadata) { @@ -1680,6 +1682,11 @@ throw new IllegalArgumentException("illegal metadata: " + meta); } minRelevance = (Literal) oVal; + } else if (BD.MAX_RELEVANCE.equals(pVal)) { + if (oVal == null || !(oVal instanceof Literal)) { + throw new IllegalArgumentException("illegal metadata: " + meta); + } + maxRelevance = (Literal) oVal; } else if (BD.MATCH_ALL_TERMS.equals(pVal)) { if (oVal == null || !(oVal instanceof Literal)) { throw new IllegalArgumentException("illegal metadata: " + meta); @@ -1690,7 +1697,7 @@ final IAccessPathExpander expander = new FreeTextSearchExpander(database, (Literal) objValue, - maxHits, minRelevance, matchAllTerms); + maxHits, minRelevance, maxRelevance, matchAllTerms); // Decide on the correct arity for the predicate. final BOp[] vars = new BOp[] { @@ -2067,7 +2074,8 @@ .getSearchEngine().search(label, languageCode, false/* prefixMatch */, 0d/* minCosine */, - 10000/* maxRank */, + 1.0d/* maxCosine */, + Integer.MAX_VALUE/* maxRank */, false/* matchAllTerms */, 0L/* timeout */, TimeUnit.MILLISECONDS); Modified: branches/QUADS_QUERY_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/FreeTextSearchExpander.java =================================================================== --- branches/QUADS_QUERY_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/FreeTextSearchExpander.java 2011-05-03 14:48:13 UTC (rev 4440) +++ branches/QUADS_QUERY_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/FreeTextSearchExpander.java 2011-05-03 16:42:23 UTC (rev 4441) @@ -59,7 +59,7 @@ */ private final AbstractTripleStore database; - private final Literal query, maxHits, minRelevance; + private final Literal query, maxHits, minRelevance, maxRelevance; private final boolean matchAllTerms; @@ -69,13 +69,14 @@ public FreeTextSearchExpander(final AbstractTripleStore database, final Literal query) { - this(database, query, null, null, false); + this(database, query, null, null, null, false); } public FreeTextSearchExpander(final AbstractTripleStore database, final Literal query, final Literal maxHits, - final Literal minRelevance, final boolean matchAllTerms) { + final Literal minRelevance, final Literal maxRelevance, + final boolean matchAllTerms) { if (database == null) throw new IllegalArgumentException(); @@ -91,6 +92,8 @@ this.minRelevance = minRelevance; + this.maxRelevance = maxRelevance; + this.matchAllTerms = matchAllTerms; } @@ -179,7 +182,8 @@ query.getLanguage(), prefixMatch, minRelevance == null ? 0d : minRelevance.doubleValue()/* minCosine */, - maxHits == null ? 10000 : maxHits.intValue()+1/* maxRank */, + maxRelevance == null ? 1.0d : maxRelevance.doubleValue()/* maxCosine */, + maxHits == null ? Integer.MAX_VALUE : maxHits.intValue()+1/* maxRank */, matchAllTerms, 0L/* timeout */, TimeUnit.MILLISECONDS); } Modified: branches/QUADS_QUERY_BRANCH/bigdata-sails/src/test/com/bigdata/rdf/sail/TestSearchQuery.java =================================================================== --- branches/QUADS_QUERY_BRANCH/bigdata-sails/src/test/com/bigdata/rdf/sail/TestSearchQuery.java 2011-05-03 14:48:13 UTC (rev 4440) +++ branches/QUADS_QUERY_BRANCH/bigdata-sails/src/test/com/bigdata/rdf/sail/TestSearchQuery.java 2011-05-03 16:42:23 UTC (rev 4441) @@ -797,6 +797,7 @@ null, // languageCode false, // prefixMatch 0d, // minCosine + 1.0d, // maxCosine 10000, // maxRank (=maxResults + 1) false, // matchAllTerms 1000L, // timeout @@ -859,6 +860,7 @@ null, // languageCode false, // prefixMatch 0d, // minCosine + 1.0d, // maxCosine maxHits+1, // maxRank (=maxResults + 1) false, // matchAllTerms 1000L, // timeout @@ -886,6 +888,7 @@ { final String searchQuery = "how now brown cow"; final double minRelevance = 0.6d; + final double maxRelevance = 0.9d; final String query = "select ?s ?o ?score " + @@ -895,6 +898,7 @@ " ?o <"+BD.SEARCH+"> \""+searchQuery+"\" . " + " ?o <"+BD.RELEVANCE+"> ?score . " + " ?o <"+BD.MIN_RELEVANCE+"> \""+minRelevance+"\" . " + + " ?o <"+BD.MAX_RELEVANCE+"> \""+maxRelevance+"\" . " + // " ?o <"+BD.MAX_HITS+"> \"5\" . " + "} " + "order by desc(?score)"; @@ -908,7 +912,7 @@ while (result.hasNext()) { System.err.println(i++ + ": " + result.next().toString()); } - assertTrue("wrong # of results", i == 3); + assertTrue("wrong # of results", i == 2); result = tupleQuery.evaluate(); @@ -921,6 +925,7 @@ null, // languageCode false, // prefixMatch minRelevance, // minCosine + maxRelevance, // maxCosine 10000, // maxRank (=maxResults + 1) false, // matchAllTerms 1000L, // timeout @@ -949,6 +954,7 @@ final String searchQuery = "brown cow"; final double minRelevance = 0.0d; + final double maxRelevance = 1.0d; final String query = "select ?s ?o ?score " + @@ -987,6 +993,7 @@ null, // languageCode false, // prefixMatch minRelevance, // minCosine + maxRelevance, // maxCosine 10000, // maxRank (=maxResults + 1) false, // matchAllTerms 1000L, // timeout @@ -1017,6 +1024,7 @@ final String searchQuery = "bro*"; final double minRelevance = 0.0d; + final double maxRelevance = 1.0d; final String query = "select ?s ?o ?score " + @@ -1055,6 +1063,7 @@ null, // languageCode true, // prefixMatch minRelevance, // minCosine + maxRelevance, // maxCosine 10000, // maxRank (=maxResults + 1) false, // matchAllTerms 1000L, // timeout @@ -1083,6 +1092,7 @@ final String searchQuery = "to*"; final double minRelevance = 0.0d; + final double maxRelevance = 1.0d; final String query = "select ?s ?o ?score " + @@ -1121,6 +1131,7 @@ null, // languageCode true, // prefixMatch minRelevance, // minCosine + maxRelevance, // maxCosine 10000, // maxRank (=maxResults + 1) false, // matchAllTerms 1000L, // timeout @@ -1149,6 +1160,7 @@ final String searchQuery = "how now brown cow"; final double minRelevance = 0.0d; + final double maxRelevance = 1.0d; final String query = "select ?s ?o " + @@ -1183,6 +1195,7 @@ null, // languageCode true, // prefixMatch minRelevance, // minCosine + maxRelevance, // maxCosine 10000, // maxRank (=maxResults + 1) true, // matchAllTerms 1000L, // timeout This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |