From: <mrp...@us...> - 2014-04-02 16:13:06
|
Revision: 8028 http://sourceforge.net/p/bigdata/code/8028 Author: mrpersonick Date: 2014-04-02 16:13:03 +0000 (Wed, 02 Apr 2014) Log Message: ----------- fixing ticket 872 - added a magic predicate to full text search for range count Modified Paths: -------------- branches/BIGDATA_RELEASE_1_3_0/bigdata/src/java/com/bigdata/search/FullTextIndex.java branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/sparql/ast/eval/ASTSearchOptimizer.java branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/sparql/ast/eval/SearchServiceFactory.java branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/store/BDS.java Modified: branches/BIGDATA_RELEASE_1_3_0/bigdata/src/java/com/bigdata/search/FullTextIndex.java =================================================================== --- branches/BIGDATA_RELEASE_1_3_0/bigdata/src/java/com/bigdata/search/FullTextIndex.java 2014-04-02 13:14:09 UTC (rev 8027) +++ branches/BIGDATA_RELEASE_1_3_0/bigdata/src/java/com/bigdata/search/FullTextIndex.java 2014-04-02 16:13:03 UTC (rev 8028) @@ -955,35 +955,137 @@ } + /** + * Perform a range count on a full text query. + */ public int count(final FullTextQuery query) { - final Hit[] a = _search(query); + if (cache.containsKey(query)) { + + if (log.isInfoEnabled()) + log.info("found hits in cache"); + + return cache.get(query).length; + + } else { + + if (log.isInfoEnabled()) + log.info("did not find hits in cache"); + + } + + // tokenize the query. + final TermFrequencyData<V> qdata = tokenize(query); + + // No terms after stopword extraction + if (qdata == null) { + + cache.put(query, new Hit[] {}); + + return 0; + + } + + /* + * We can run an optimized version of this (just a quick range count) + * but only if the caller does not care about exact match and has + * not specified a regex. + */ + if (qdata.distinctTermCount() == 1 && + !query.isMatchExact() && query.getMatchRegex() == null) { + + final boolean prefixMatch = query.isPrefixMatch(); + + final Map.Entry<String, ITermMetadata> e = qdata.getSingletonEntry(); + + final String termText = e.getKey(); + + final ITermMetadata md = e.getValue(); + + final CountIndexTask<V> task1 = new CountIndexTask<V>(termText, 0, 1, + prefixMatch, md.getLocalTermWeight(), this); + + return (int) task1.getRangeCount(); + + } else { + + final Hit<V>[] a = _search(query); + + return a.length; + + } - return a.length; - } - public Hit<V>[] _search(final FullTextQuery q) { + protected TermFrequencyData<V> tokenize(final FullTextQuery query) { - final String query = q.getQuery(); - final String languageCode = q.getLanguageCode(); - final boolean prefixMatch = q.isPrefixMatch(); - final double minCosine = q.getMinCosine(); - final double maxCosine = q.getMaxCosine(); - final int minRank = q.getMinRank(); - final int maxRank = q.getMaxRank(); - final boolean matchAllTerms = q.isMatchAllTerms(); - final boolean matchExact = q.isMatchExact(); - final String regex = q.getMatchRegex(); - long timeout = q.getTimeout(); - final TimeUnit unit = q.getTimeUnit(); + final String q = query.getQuery(); + final String languageCode = query.getLanguageCode(); + final boolean prefixMatch = query.isPrefixMatch(); + // tokenize the query. + final TermFrequencyData<V> qdata; + { + + final TokenBuffer<V> buffer = new TokenBuffer<V>(1, this); + + /* + * If we are using prefix match ('*' operator) then we don't want to + * filter stopwords from the search query. + */ + final boolean filterStopwords = !prefixMatch; + + index(buffer, // + null, // docId // was Long.MIN_VALUE + Integer.MIN_VALUE, // fieldId + languageCode,// + new StringReader(q), // + filterStopwords// + ); + + if (buffer.size() == 0) { + + /* + * There were no terms after stopword extration. + */ + + log.warn("No terms after stopword extraction: query=" + query); + + return null; + + } + + qdata = buffer.get(0); + + qdata.normalize(); + + } + + return qdata; + + } + + public Hit<V>[] _search(final FullTextQuery query) { + + final String queryStr = query.getQuery(); + final String languageCode = query.getLanguageCode(); + final boolean prefixMatch = query.isPrefixMatch(); + final double minCosine = query.getMinCosine(); + final double maxCosine = query.getMaxCosine(); + final int minRank = query.getMinRank(); + final int maxRank = query.getMaxRank(); + final boolean matchAllTerms = query.isMatchAllTerms(); + final boolean matchExact = query.isMatchExact(); + final String regex = query.getMatchRegex(); + long timeout = query.getTimeout(); + final TimeUnit unit = query.getTimeUnit(); + final long begin = System.currentTimeMillis(); // if (languageCode == null) // throw new IllegalArgumentException(); - if (query == null) + if (queryStr == null) throw new IllegalArgumentException(); if (minCosine < 0d || minCosine > 1d) @@ -1002,7 +1104,7 @@ throw new IllegalArgumentException(); if (log.isInfoEnabled()) - log.info("languageCode=[" + languageCode + "], text=[" + query + log.info("languageCode=[" + languageCode + "], text=[" + queryStr + "], minCosine=" + minCosine + ", maxCosine=" + maxCosine + ", minRank=" + minRank @@ -1018,7 +1120,7 @@ } - final FullTextQuery cacheKey = q; + final FullTextQuery cacheKey = query; Hit<V>[] a; @@ -1034,145 +1136,24 @@ if (log.isInfoEnabled()) log.info("did not find hits in cache"); - // tokenize the query. - final TermFrequencyData<V> qdata; - { - - final TokenBuffer<V> buffer = new TokenBuffer<V>(1, this); - - /* - * If we are using prefix match ('*' operator) then we don't want to - * filter stopwords from the search query. - */ - final boolean filterStopwords = !prefixMatch; - - index(buffer, // - null, // docId // was Long.MIN_VALUE - Integer.MIN_VALUE, // fieldId - languageCode,// - new StringReader(query), // - filterStopwords// - ); - - if (buffer.size() == 0) { - - /* - * There were no terms after stopword extration. - */ - - log.warn("No terms after stopword extraction: query=" + query); - - a = new Hit[] {}; - - cache.put(cacheKey, a); - - return a; - - } - - qdata = buffer.get(0); - - qdata.normalize(); - - } - - final IHitCollector<V> hits; - - if (qdata.distinctTermCount() == 1) { - - final Map.Entry<String, ITermMetadata> e = qdata.getSingletonEntry(); - - final String termText = e.getKey(); + // tokenize the query. + final TermFrequencyData<V> qdata = tokenize(query); + + // No terms after stopword extraction + if (qdata == null) { - final ITermMetadata md = e.getValue(); - - final CountIndexTask<V> task1 = new CountIndexTask<V>(termText, 0, 1, prefixMatch, md - .getLocalTermWeight(), this); - - hits = new SingleTokenHitCollector<V>(task1); - - } else { - - final List<CountIndexTask<V>> tasks = new ArrayList<CountIndexTask<V>>( - qdata.distinctTermCount()); - - int i = 0; - for (Map.Entry<String, ITermMetadata> e : qdata.terms.entrySet()) { - - final String termText = e.getKey(); - - final ITermMetadata md = e.getValue(); - - tasks.add(new CountIndexTask<V>(termText, i++, qdata.terms.size(), prefixMatch, md - .getLocalTermWeight(), this)); - - } - - hits = new MultiTokenHitCollector<V>(tasks); - - } - - // run the queries. - { - - final List<Callable<Object>> tasks = new ArrayList<Callable<Object>>( - qdata.distinctTermCount()); - - int i = 0; - for (Map.Entry<String, ITermMetadata> e : qdata.terms.entrySet()) { - - final String termText = e.getKey(); - - final ITermMetadata md = e.getValue(); - - tasks.add(new ReadIndexTask<V>(termText, i++, qdata.terms.size(), - prefixMatch, md.getLocalTermWeight(), this, hits)); - - } - - final ExecutionHelper<Object> executionHelper = new ExecutionHelper<Object>( - getExecutorService(), timeout, unit); - - try { - - final long start = System.currentTimeMillis(); - - executionHelper.submitTasks(tasks); - - if (log.isInfoEnabled()) { - final long readTime = System.currentTimeMillis() - start; - log.info("read time: " + readTime); - } - - } catch (InterruptedException ex) { - - if (log.isInfoEnabled()) { - // TODO Should we wrap and toss this interrupt instead? - log.info("Interrupted - only partial results will be returned."); - } - - /* - * Yes, let's toss it. We were getting into a situation - * where the ExecutionHelper above received an interrupt - * but we still went through the heavy-weight filtering - * operations below (matchExact or matchRegex). - */ - throw new RuntimeException(ex); - - } catch (ExecutionException ex) { - - throw new RuntimeException(ex); - - } - - } - - a = hits.getHits(); - + cache.put(cacheKey, a = new Hit[] {}); + + return a; + + } + + a = executeQuery(qdata, prefixMatch, timeout, unit); + if (a.length == 0) { log.info("No hits: languageCode=[" + languageCode + "], query=[" - + query + "]"); + + queryStr + "]"); cache.put(cacheKey, a); @@ -1223,14 +1204,14 @@ */ if (matchExact) { - a = matchExact(a, query); + a = matchExact(a, queryStr); } if (a.length == 0) { log.warn("No hits after matchAllTerms pruning: languageCode=[" + languageCode + "], query=[" - + query + "]"); + + queryStr + "]"); cache.put(cacheKey, a); @@ -1260,7 +1241,7 @@ if (a.length == 0) { log.warn("No hits after regex pruning: languageCode=[" + languageCode + "], query=[" - + query + "], regex=[" + regex + "]"); + + queryStr + "], regex=[" + regex + "]"); cache.put(cacheKey, a); @@ -1299,6 +1280,27 @@ } + /* + * Take a slice of the hits based on min/max cosine and min/max rank. + */ + a = slice(query, a); + + final long elapsed = System.currentTimeMillis() - begin; + + if (log.isInfoEnabled()) + log.info("Done: " + a.length + " hits in " + elapsed + "ms"); + + return a; + + } + + protected Hit<V>[] slice(final FullTextQuery query, Hit<V>[] a) { + + final double minCosine = query.getMinCosine(); + final double maxCosine = query.getMaxCosine(); + final int minRank = query.getMinRank(); + final int maxRank = query.getMaxRank(); + // if (log.isDebugEnabled()) { // log.debug("before min/max cosine/rank pruning:"); // for (Hit<V> h : a) @@ -1422,13 +1424,106 @@ } - final long elapsed = System.currentTimeMillis() - begin; + return a; - if (log.isInfoEnabled()) - log.info("Done: " + a.length + " hits in " + elapsed + "ms"); + } + + protected Hit<V>[] executeQuery(final TermFrequencyData<V> qdata, + final boolean prefixMatch, final long timeout, final TimeUnit unit) { + + final IHitCollector<V> hits; + + if (qdata.distinctTermCount() == 1) { + + final Map.Entry<String, ITermMetadata> e = qdata.getSingletonEntry(); + + final String termText = e.getKey(); + + final ITermMetadata md = e.getValue(); - return a; + final CountIndexTask<V> task1 = new CountIndexTask<V>(termText, 0, 1, + prefixMatch, md.getLocalTermWeight(), this); + + hits = new SingleTokenHitCollector<V>(task1); + + } else { + + final List<CountIndexTask<V>> tasks = new ArrayList<CountIndexTask<V>>( + qdata.distinctTermCount()); + + int i = 0; + for (Map.Entry<String, ITermMetadata> e : qdata.terms.entrySet()) { + + final String termText = e.getKey(); + + final ITermMetadata md = e.getValue(); + + tasks.add(new CountIndexTask<V>(termText, i++, qdata.terms.size(), + prefixMatch, md.getLocalTermWeight(), this)); + + } + + hits = new MultiTokenHitCollector<V>(tasks); + + } + // run the queries. + { + + final List<Callable<Object>> tasks = new ArrayList<Callable<Object>>( + qdata.distinctTermCount()); + + int i = 0; + for (Map.Entry<String, ITermMetadata> e : qdata.terms.entrySet()) { + + final String termText = e.getKey(); + + final ITermMetadata md = e.getValue(); + + tasks.add(new ReadIndexTask<V>(termText, i++, qdata.terms.size(), + prefixMatch, md.getLocalTermWeight(), this, hits)); + + } + + final ExecutionHelper<Object> executionHelper = new ExecutionHelper<Object>( + getExecutorService(), timeout, unit); + + try { + + final long start = System.currentTimeMillis(); + + executionHelper.submitTasks(tasks); + + if (log.isInfoEnabled()) { + final long readTime = System.currentTimeMillis() - start; + log.info("read time: " + readTime); + } + + } catch (InterruptedException ex) { + + if (log.isInfoEnabled()) { + // TODO Should we wrap and toss this interrupt instead? + log.info("Interrupted - only partial results will be returned."); + } + + /* + * Yes, let's toss it. We were getting into a situation + * where the ExecutionHelper above received an interrupt + * but we still went through the heavy-weight filtering + * operations below (matchExact or matchRegex). + */ + throw new RuntimeException(ex); + + } catch (ExecutionException ex) { + + throw new RuntimeException(ex); + + } + + } + + return hits.getHits(); + } /** Modified: branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/sparql/ast/eval/ASTSearchOptimizer.java =================================================================== --- branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/sparql/ast/eval/ASTSearchOptimizer.java 2014-04-02 13:14:09 UTC (rev 8027) +++ branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/sparql/ast/eval/ASTSearchOptimizer.java 2014-04-02 16:13:03 UTC (rev 8028) @@ -108,6 +108,7 @@ set.add(BDS.SUBJECT_SEARCH); set.add(BDS.SEARCH_TIMEOUT); set.add(BDS.MATCH_REGEX); + set.add(BDS.RANGE_COUNT); searchUris = Collections.unmodifiableSet(set); Modified: branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/sparql/ast/eval/SearchServiceFactory.java =================================================================== --- branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/sparql/ast/eval/SearchServiceFactory.java 2014-04-02 13:14:09 UTC (rev 8027) +++ branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/sparql/ast/eval/SearchServiceFactory.java 2014-04-02 16:13:03 UTC (rev 8028) @@ -69,6 +69,7 @@ import com.bigdata.rdf.store.BDS; import com.bigdata.search.Hiterator; import com.bigdata.search.IHit; +import com.bigdata.striterator.ChunkedArrayIterator; import cutthecrap.utils.striterators.ICloseableIterator; @@ -300,6 +301,10 @@ assertObjectIsLiteral(sp); + } else if (uri.equals(BDS.RANGE_COUNT)) { + + assertObjectIsVariable(sp); + } else if(uri.equals(BDS.MATCH_REGEX)) { // a variable for the object is equivalent to regex = null @@ -367,6 +372,7 @@ private final boolean subjectSearch; private final Literal searchTimeout; private final Literal matchRegex; + private final IVariable<?> rangeCountVar; public SearchCall( final AbstractTripleStore store, @@ -415,6 +421,7 @@ IVariable<?> relVar = null; IVariable<?> rankVar = null; + IVariable<?> rangeCountVar = null; Literal minRank = null; Literal maxRank = null; Literal minRelevance = null; @@ -439,6 +446,8 @@ relVar = oVar; } else if (BDS.RANK.equals(p)) { rankVar = oVar; + } else if (BDS.RANGE_COUNT.equals(p)) { + rangeCountVar = oVar; } else if (BDS.MIN_RANK.equals(p)) { minRank = (Literal) oVal; } else if (BDS.MAX_RANK.equals(p)) { @@ -484,6 +493,7 @@ this.subjectSearch = subjectSearch; this.searchTimeout = searchTimeout; this.matchRegex = matchRegex; + this.rangeCountVar = rangeCountVar; } @@ -527,6 +537,46 @@ } + @SuppressWarnings({ "rawtypes", "unchecked" }) + private int getRangeCount() { + +// final IValueCentricTextIndexer<IHit> textIndex = (IValueCentricTextIndexer) store +// .getLexiconRelation().getSearchEngine(); + + final ITextIndexer<IHit> textIndex = (ITextIndexer) + (this.subjectSearch ? + store.getLexiconRelation().getSubjectCentricSearchEngine() : + store.getLexiconRelation().getSearchEngine()); + + if (textIndex == null) + throw new UnsupportedOperationException("No free text index?"); + + String s = query.getLabel(); + final boolean prefixMatch; + if (s.indexOf('*') >= 0) { + prefixMatch = true; + s = s.replaceAll("\\*", ""); + } else { + prefixMatch = false; + } + + return textIndex.count(new FullTextQuery( + s,// + query.getLanguage(),// + prefixMatch,// + matchRegex == null ? null : matchRegex.stringValue(), + matchAllTerms, + matchExact, + minRelevance == null ? BDS.DEFAULT_MIN_RELEVANCE : minRelevance.doubleValue()/* minCosine */, + maxRelevance == null ? BDS.DEFAULT_MAX_RELEVANCE : maxRelevance.doubleValue()/* maxCosine */, + minRank == null ? BDS.DEFAULT_MIN_RANK/*1*/ : minRank.intValue()/* minRank */, + maxRank == null ? BDS.DEFAULT_MAX_RANK/*Integer.MAX_VALUE*/ : maxRank.intValue()/* maxRank */, + searchTimeout == null ? BDS.DEFAULT_TIMEOUT/*0L*/ : searchTimeout.longValue()/* timeout */, + TimeUnit.MILLISECONDS + )); + + } + /** * {@inheritDoc} * @@ -561,7 +611,24 @@ } - return new HitConverter(getHiterator()); + if (rangeCountVar != null) { + + final int i = getRangeCount(); + + @SuppressWarnings({ "rawtypes", "unchecked" }) + final ListBindingSet bs = new ListBindingSet( + new IVariable[] { rangeCountVar }, + new IConstant[] { new Constant(new XSDNumericIV(i)) }); + + return new ChunkedArrayIterator<IBindingSet>(new IBindingSet[] { + bs + }); + + } else { + + return new HitConverter(getHiterator()); + + } } @@ -631,11 +698,11 @@ final ListBindingSet bs = new ListBindingSet(vars, vals); - if (log.isInfoEnabled()) { - log.info(bs); - log.info(query.getClass()); - log.info(((BigdataLiteral) query).getIV()); - log.info(((BigdataLiteral) query).getIV().getClass()); + if (log.isTraceEnabled()) { + log.trace(bs); + log.trace(query.getClass()); + log.trace(((BigdataLiteral) query).getIV()); + log.trace(((BigdataLiteral) query).getIV().getClass()); } return bs; Modified: branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/store/BDS.java =================================================================== --- branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/store/BDS.java 2014-04-02 13:14:09 UTC (rev 8027) +++ branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/store/BDS.java 2014-04-02 16:13:03 UTC (rev 8028) @@ -420,5 +420,14 @@ * The default timeout for a free text search (milliseconds). */ final long DEFAULT_TIMEOUT = Long.MAX_VALUE; + + /** + * Magic predicate to specify that we want a range count done on the search. + * Bind the range count to the variable in the object position. Will + * attempt to do a fast range count on the index rather than materializing + * the hits into an array. This is only possible if matchExact == false + * and matchRegex == null. + */ + final URI RANGE_COUNT = new URIImpl(NAMESPACE + "rangeCount"); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |