|
From: <mrp...@us...> - 2014-04-02 16:13:06
|
Revision: 8028
http://sourceforge.net/p/bigdata/code/8028
Author: mrpersonick
Date: 2014-04-02 16:13:03 +0000 (Wed, 02 Apr 2014)
Log Message:
-----------
fixing ticket 872 - added a magic predicate to full text search for range count
Modified Paths:
--------------
branches/BIGDATA_RELEASE_1_3_0/bigdata/src/java/com/bigdata/search/FullTextIndex.java
branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/sparql/ast/eval/ASTSearchOptimizer.java
branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/sparql/ast/eval/SearchServiceFactory.java
branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/store/BDS.java
Modified: branches/BIGDATA_RELEASE_1_3_0/bigdata/src/java/com/bigdata/search/FullTextIndex.java
===================================================================
--- branches/BIGDATA_RELEASE_1_3_0/bigdata/src/java/com/bigdata/search/FullTextIndex.java 2014-04-02 13:14:09 UTC (rev 8027)
+++ branches/BIGDATA_RELEASE_1_3_0/bigdata/src/java/com/bigdata/search/FullTextIndex.java 2014-04-02 16:13:03 UTC (rev 8028)
@@ -955,35 +955,137 @@
}
+ /**
+ * Perform a range count on a full text query.
+ */
public int count(final FullTextQuery query) {
- final Hit[] a = _search(query);
+ if (cache.containsKey(query)) {
+
+ if (log.isInfoEnabled())
+ log.info("found hits in cache");
+
+ return cache.get(query).length;
+
+ } else {
+
+ if (log.isInfoEnabled())
+ log.info("did not find hits in cache");
+
+ }
+
+ // tokenize the query.
+ final TermFrequencyData<V> qdata = tokenize(query);
+
+ // No terms after stopword extraction
+ if (qdata == null) {
+
+ cache.put(query, new Hit[] {});
+
+ return 0;
+
+ }
+
+ /*
+ * We can run an optimized version of this (just a quick range count)
+ * but only if the caller does not care about exact match and has
+ * not specified a regex.
+ */
+ if (qdata.distinctTermCount() == 1 &&
+ !query.isMatchExact() && query.getMatchRegex() == null) {
+
+ final boolean prefixMatch = query.isPrefixMatch();
+
+ final Map.Entry<String, ITermMetadata> e = qdata.getSingletonEntry();
+
+ final String termText = e.getKey();
+
+ final ITermMetadata md = e.getValue();
+
+ final CountIndexTask<V> task1 = new CountIndexTask<V>(termText, 0, 1,
+ prefixMatch, md.getLocalTermWeight(), this);
+
+ return (int) task1.getRangeCount();
+
+ } else {
+
+ final Hit<V>[] a = _search(query);
+
+ return a.length;
+
+ }
- return a.length;
-
}
- public Hit<V>[] _search(final FullTextQuery q) {
+ protected TermFrequencyData<V> tokenize(final FullTextQuery query) {
- final String query = q.getQuery();
- final String languageCode = q.getLanguageCode();
- final boolean prefixMatch = q.isPrefixMatch();
- final double minCosine = q.getMinCosine();
- final double maxCosine = q.getMaxCosine();
- final int minRank = q.getMinRank();
- final int maxRank = q.getMaxRank();
- final boolean matchAllTerms = q.isMatchAllTerms();
- final boolean matchExact = q.isMatchExact();
- final String regex = q.getMatchRegex();
- long timeout = q.getTimeout();
- final TimeUnit unit = q.getTimeUnit();
+ final String q = query.getQuery();
+ final String languageCode = query.getLanguageCode();
+ final boolean prefixMatch = query.isPrefixMatch();
+ // tokenize the query.
+ final TermFrequencyData<V> qdata;
+ {
+
+ final TokenBuffer<V> buffer = new TokenBuffer<V>(1, this);
+
+ /*
+ * If we are using prefix match ('*' operator) then we don't want to
+ * filter stopwords from the search query.
+ */
+ final boolean filterStopwords = !prefixMatch;
+
+ index(buffer, //
+ null, // docId // was Long.MIN_VALUE
+ Integer.MIN_VALUE, // fieldId
+ languageCode,//
+ new StringReader(q), //
+ filterStopwords//
+ );
+
+ if (buffer.size() == 0) {
+
+ /*
- * There were no terms after stopword extraction.
+ */
+
+ log.warn("No terms after stopword extraction: query=" + query);
+
+ return null;
+
+ }
+
+ qdata = buffer.get(0);
+
+ qdata.normalize();
+
+ }
+
+ return qdata;
+
+ }
+
+ public Hit<V>[] _search(final FullTextQuery query) {
+
+ final String queryStr = query.getQuery();
+ final String languageCode = query.getLanguageCode();
+ final boolean prefixMatch = query.isPrefixMatch();
+ final double minCosine = query.getMinCosine();
+ final double maxCosine = query.getMaxCosine();
+ final int minRank = query.getMinRank();
+ final int maxRank = query.getMaxRank();
+ final boolean matchAllTerms = query.isMatchAllTerms();
+ final boolean matchExact = query.isMatchExact();
+ final String regex = query.getMatchRegex();
+ long timeout = query.getTimeout();
+ final TimeUnit unit = query.getTimeUnit();
+
final long begin = System.currentTimeMillis();
// if (languageCode == null)
// throw new IllegalArgumentException();
- if (query == null)
+ if (queryStr == null)
throw new IllegalArgumentException();
if (minCosine < 0d || minCosine > 1d)
@@ -1002,7 +1104,7 @@
throw new IllegalArgumentException();
if (log.isInfoEnabled())
- log.info("languageCode=[" + languageCode + "], text=[" + query
+ log.info("languageCode=[" + languageCode + "], text=[" + queryStr
+ "], minCosine=" + minCosine
+ ", maxCosine=" + maxCosine
+ ", minRank=" + minRank
@@ -1018,7 +1120,7 @@
}
- final FullTextQuery cacheKey = q;
+ final FullTextQuery cacheKey = query;
Hit<V>[] a;
@@ -1034,145 +1136,24 @@
if (log.isInfoEnabled())
log.info("did not find hits in cache");
- // tokenize the query.
- final TermFrequencyData<V> qdata;
- {
-
- final TokenBuffer<V> buffer = new TokenBuffer<V>(1, this);
-
- /*
- * If we are using prefix match ('*' operator) then we don't want to
- * filter stopwords from the search query.
- */
- final boolean filterStopwords = !prefixMatch;
-
- index(buffer, //
- null, // docId // was Long.MIN_VALUE
- Integer.MIN_VALUE, // fieldId
- languageCode,//
- new StringReader(query), //
- filterStopwords//
- );
-
- if (buffer.size() == 0) {
-
- /*
- * There were no terms after stopword extration.
- */
-
- log.warn("No terms after stopword extraction: query=" + query);
-
- a = new Hit[] {};
-
- cache.put(cacheKey, a);
-
- return a;
-
- }
-
- qdata = buffer.get(0);
-
- qdata.normalize();
-
- }
-
- final IHitCollector<V> hits;
-
- if (qdata.distinctTermCount() == 1) {
-
- final Map.Entry<String, ITermMetadata> e = qdata.getSingletonEntry();
-
- final String termText = e.getKey();
+ // tokenize the query.
+ final TermFrequencyData<V> qdata = tokenize(query);
+
+ // No terms after stopword extraction
+ if (qdata == null) {
- final ITermMetadata md = e.getValue();
-
- final CountIndexTask<V> task1 = new CountIndexTask<V>(termText, 0, 1, prefixMatch, md
- .getLocalTermWeight(), this);
-
- hits = new SingleTokenHitCollector<V>(task1);
-
- } else {
-
- final List<CountIndexTask<V>> tasks = new ArrayList<CountIndexTask<V>>(
- qdata.distinctTermCount());
-
- int i = 0;
- for (Map.Entry<String, ITermMetadata> e : qdata.terms.entrySet()) {
-
- final String termText = e.getKey();
-
- final ITermMetadata md = e.getValue();
-
- tasks.add(new CountIndexTask<V>(termText, i++, qdata.terms.size(), prefixMatch, md
- .getLocalTermWeight(), this));
-
- }
-
- hits = new MultiTokenHitCollector<V>(tasks);
-
- }
-
- // run the queries.
- {
-
- final List<Callable<Object>> tasks = new ArrayList<Callable<Object>>(
- qdata.distinctTermCount());
-
- int i = 0;
- for (Map.Entry<String, ITermMetadata> e : qdata.terms.entrySet()) {
-
- final String termText = e.getKey();
-
- final ITermMetadata md = e.getValue();
-
- tasks.add(new ReadIndexTask<V>(termText, i++, qdata.terms.size(),
- prefixMatch, md.getLocalTermWeight(), this, hits));
-
- }
-
- final ExecutionHelper<Object> executionHelper = new ExecutionHelper<Object>(
- getExecutorService(), timeout, unit);
-
- try {
-
- final long start = System.currentTimeMillis();
-
- executionHelper.submitTasks(tasks);
-
- if (log.isInfoEnabled()) {
- final long readTime = System.currentTimeMillis() - start;
- log.info("read time: " + readTime);
- }
-
- } catch (InterruptedException ex) {
-
- if (log.isInfoEnabled()) {
- // TODO Should we wrap and toss this interrupt instead?
- log.info("Interrupted - only partial results will be returned.");
- }
-
- /*
- * Yes, let's toss it. We were getting into a situation
- * where the ExecutionHelper above received an interrupt
- * but we still went through the heavy-weight filtering
- * operations below (matchExact or matchRegex).
- */
- throw new RuntimeException(ex);
-
- } catch (ExecutionException ex) {
-
- throw new RuntimeException(ex);
-
- }
-
- }
-
- a = hits.getHits();
-
+ cache.put(cacheKey, a = new Hit[] {});
+
+ return a;
+
+ }
+
+ a = executeQuery(qdata, prefixMatch, timeout, unit);
+
if (a.length == 0) {
log.info("No hits: languageCode=[" + languageCode + "], query=["
- + query + "]");
+ + queryStr + "]");
cache.put(cacheKey, a);
@@ -1223,14 +1204,14 @@
*/
if (matchExact) {
- a = matchExact(a, query);
+ a = matchExact(a, queryStr);
}
if (a.length == 0) {
log.warn("No hits after matchAllTerms pruning: languageCode=[" + languageCode + "], query=["
- + query + "]");
+ + queryStr + "]");
cache.put(cacheKey, a);
@@ -1260,7 +1241,7 @@
if (a.length == 0) {
log.warn("No hits after regex pruning: languageCode=[" + languageCode + "], query=["
- + query + "], regex=[" + regex + "]");
+ + queryStr + "], regex=[" + regex + "]");
cache.put(cacheKey, a);
@@ -1299,6 +1280,27 @@
}
+ /*
+ * Take a slice of the hits based on min/max cosine and min/max rank.
+ */
+ a = slice(query, a);
+
+ final long elapsed = System.currentTimeMillis() - begin;
+
+ if (log.isInfoEnabled())
+ log.info("Done: " + a.length + " hits in " + elapsed + "ms");
+
+ return a;
+
+ }
+
+ protected Hit<V>[] slice(final FullTextQuery query, Hit<V>[] a) {
+
+ final double minCosine = query.getMinCosine();
+ final double maxCosine = query.getMaxCosine();
+ final int minRank = query.getMinRank();
+ final int maxRank = query.getMaxRank();
+
// if (log.isDebugEnabled()) {
// log.debug("before min/max cosine/rank pruning:");
// for (Hit<V> h : a)
@@ -1422,13 +1424,106 @@
}
- final long elapsed = System.currentTimeMillis() - begin;
+ return a;
- if (log.isInfoEnabled())
- log.info("Done: " + a.length + " hits in " + elapsed + "ms");
+ }
+
+ protected Hit<V>[] executeQuery(final TermFrequencyData<V> qdata,
+ final boolean prefixMatch, final long timeout, final TimeUnit unit) {
+
+ final IHitCollector<V> hits;
+
+ if (qdata.distinctTermCount() == 1) {
+
+ final Map.Entry<String, ITermMetadata> e = qdata.getSingletonEntry();
+
+ final String termText = e.getKey();
+
+ final ITermMetadata md = e.getValue();
- return a;
+ final CountIndexTask<V> task1 = new CountIndexTask<V>(termText, 0, 1,
+ prefixMatch, md.getLocalTermWeight(), this);
+
+ hits = new SingleTokenHitCollector<V>(task1);
+
+ } else {
+
+ final List<CountIndexTask<V>> tasks = new ArrayList<CountIndexTask<V>>(
+ qdata.distinctTermCount());
+
+ int i = 0;
+ for (Map.Entry<String, ITermMetadata> e : qdata.terms.entrySet()) {
+
+ final String termText = e.getKey();
+
+ final ITermMetadata md = e.getValue();
+
+ tasks.add(new CountIndexTask<V>(termText, i++, qdata.terms.size(),
+ prefixMatch, md.getLocalTermWeight(), this));
+
+ }
+
+ hits = new MultiTokenHitCollector<V>(tasks);
+
+ }
+ // run the queries.
+ {
+
+ final List<Callable<Object>> tasks = new ArrayList<Callable<Object>>(
+ qdata.distinctTermCount());
+
+ int i = 0;
+ for (Map.Entry<String, ITermMetadata> e : qdata.terms.entrySet()) {
+
+ final String termText = e.getKey();
+
+ final ITermMetadata md = e.getValue();
+
+ tasks.add(new ReadIndexTask<V>(termText, i++, qdata.terms.size(),
+ prefixMatch, md.getLocalTermWeight(), this, hits));
+
+ }
+
+ final ExecutionHelper<Object> executionHelper = new ExecutionHelper<Object>(
+ getExecutorService(), timeout, unit);
+
+ try {
+
+ final long start = System.currentTimeMillis();
+
+ executionHelper.submitTasks(tasks);
+
+ if (log.isInfoEnabled()) {
+ final long readTime = System.currentTimeMillis() - start;
+ log.info("read time: " + readTime);
+ }
+
+ } catch (InterruptedException ex) {
+
+ if (log.isInfoEnabled()) {
+ // TODO Should we wrap and toss this interrupt instead?
+ log.info("Interrupted - only partial results will be returned.");
+ }
+
+ /*
+ * Yes, let's toss it. We were getting into a situation
+ * where the ExecutionHelper above received an interrupt
+ * but we still went through the heavy-weight filtering
+ * operations below (matchExact or matchRegex).
+ */
+ throw new RuntimeException(ex);
+
+ } catch (ExecutionException ex) {
+
+ throw new RuntimeException(ex);
+
+ }
+
+ }
+
+ return hits.getHits();
+
}
/**
Modified: branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/sparql/ast/eval/ASTSearchOptimizer.java
===================================================================
--- branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/sparql/ast/eval/ASTSearchOptimizer.java 2014-04-02 13:14:09 UTC (rev 8027)
+++ branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/sparql/ast/eval/ASTSearchOptimizer.java 2014-04-02 16:13:03 UTC (rev 8028)
@@ -108,6 +108,7 @@
set.add(BDS.SUBJECT_SEARCH);
set.add(BDS.SEARCH_TIMEOUT);
set.add(BDS.MATCH_REGEX);
+ set.add(BDS.RANGE_COUNT);
searchUris = Collections.unmodifiableSet(set);
Modified: branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/sparql/ast/eval/SearchServiceFactory.java
===================================================================
--- branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/sparql/ast/eval/SearchServiceFactory.java 2014-04-02 13:14:09 UTC (rev 8027)
+++ branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/sparql/ast/eval/SearchServiceFactory.java 2014-04-02 16:13:03 UTC (rev 8028)
@@ -69,6 +69,7 @@
import com.bigdata.rdf.store.BDS;
import com.bigdata.search.Hiterator;
import com.bigdata.search.IHit;
+import com.bigdata.striterator.ChunkedArrayIterator;
import cutthecrap.utils.striterators.ICloseableIterator;
@@ -300,6 +301,10 @@
assertObjectIsLiteral(sp);
+ } else if (uri.equals(BDS.RANGE_COUNT)) {
+
+ assertObjectIsVariable(sp);
+
} else if(uri.equals(BDS.MATCH_REGEX)) {
// a variable for the object is equivalent to regex = null
@@ -367,6 +372,7 @@
private final boolean subjectSearch;
private final Literal searchTimeout;
private final Literal matchRegex;
+ private final IVariable<?> rangeCountVar;
public SearchCall(
final AbstractTripleStore store,
@@ -415,6 +421,7 @@
IVariable<?> relVar = null;
IVariable<?> rankVar = null;
+ IVariable<?> rangeCountVar = null;
Literal minRank = null;
Literal maxRank = null;
Literal minRelevance = null;
@@ -439,6 +446,8 @@
relVar = oVar;
} else if (BDS.RANK.equals(p)) {
rankVar = oVar;
+ } else if (BDS.RANGE_COUNT.equals(p)) {
+ rangeCountVar = oVar;
} else if (BDS.MIN_RANK.equals(p)) {
minRank = (Literal) oVal;
} else if (BDS.MAX_RANK.equals(p)) {
@@ -484,6 +493,7 @@
this.subjectSearch = subjectSearch;
this.searchTimeout = searchTimeout;
this.matchRegex = matchRegex;
+ this.rangeCountVar = rangeCountVar;
}
@@ -527,6 +537,46 @@
}
+ @SuppressWarnings({ "rawtypes", "unchecked" })
+ private int getRangeCount() {
+
+// final IValueCentricTextIndexer<IHit> textIndex = (IValueCentricTextIndexer) store
+// .getLexiconRelation().getSearchEngine();
+
+ final ITextIndexer<IHit> textIndex = (ITextIndexer)
+ (this.subjectSearch ?
+ store.getLexiconRelation().getSubjectCentricSearchEngine() :
+ store.getLexiconRelation().getSearchEngine());
+
+ if (textIndex == null)
+ throw new UnsupportedOperationException("No free text index?");
+
+ String s = query.getLabel();
+ final boolean prefixMatch;
+ if (s.indexOf('*') >= 0) {
+ prefixMatch = true;
+ s = s.replaceAll("\\*", "");
+ } else {
+ prefixMatch = false;
+ }
+
+ return textIndex.count(new FullTextQuery(
+ s,//
+ query.getLanguage(),//
+ prefixMatch,//
+ matchRegex == null ? null : matchRegex.stringValue(),
+ matchAllTerms,
+ matchExact,
+ minRelevance == null ? BDS.DEFAULT_MIN_RELEVANCE : minRelevance.doubleValue()/* minCosine */,
+ maxRelevance == null ? BDS.DEFAULT_MAX_RELEVANCE : maxRelevance.doubleValue()/* maxCosine */,
+ minRank == null ? BDS.DEFAULT_MIN_RANK/*1*/ : minRank.intValue()/* minRank */,
+ maxRank == null ? BDS.DEFAULT_MAX_RANK/*Integer.MAX_VALUE*/ : maxRank.intValue()/* maxRank */,
+ searchTimeout == null ? BDS.DEFAULT_TIMEOUT/*0L*/ : searchTimeout.longValue()/* timeout */,
+ TimeUnit.MILLISECONDS
+ ));
+
+ }
+
/**
* {@inheritDoc}
*
@@ -561,7 +611,24 @@
}
- return new HitConverter(getHiterator());
+ if (rangeCountVar != null) {
+
+ final int i = getRangeCount();
+
+ @SuppressWarnings({ "rawtypes", "unchecked" })
+ final ListBindingSet bs = new ListBindingSet(
+ new IVariable[] { rangeCountVar },
+ new IConstant[] { new Constant(new XSDNumericIV(i)) });
+
+ return new ChunkedArrayIterator<IBindingSet>(new IBindingSet[] {
+ bs
+ });
+
+ } else {
+
+ return new HitConverter(getHiterator());
+
+ }
}
@@ -631,11 +698,11 @@
final ListBindingSet bs = new ListBindingSet(vars, vals);
- if (log.isInfoEnabled()) {
- log.info(bs);
- log.info(query.getClass());
- log.info(((BigdataLiteral) query).getIV());
- log.info(((BigdataLiteral) query).getIV().getClass());
+ if (log.isTraceEnabled()) {
+ log.trace(bs);
+ log.trace(query.getClass());
+ log.trace(((BigdataLiteral) query).getIV());
+ log.trace(((BigdataLiteral) query).getIV().getClass());
}
return bs;
Modified: branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/store/BDS.java
===================================================================
--- branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/store/BDS.java 2014-04-02 13:14:09 UTC (rev 8027)
+++ branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/store/BDS.java 2014-04-02 16:13:03 UTC (rev 8028)
@@ -420,5 +420,14 @@
* The default timeout for a free text search (milliseconds).
*/
final long DEFAULT_TIMEOUT = Long.MAX_VALUE;
+
+ /**
+ * Magic predicate to specify that we want a range count done on the search.
+ * Bind the range count to the variable in the object position. Will
+ * attempt to do a fast range count on the index rather than materializing
+ * the hits into an array. This is only possible if matchExact == false
+ * and matchRegex == null.
+ */
+ final URI RANGE_COUNT = new URIImpl(NAMESPACE + "rangeCount");
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|