[Bigdata-commit] SF.net SVN: bigdata:[8028] branches/BIGDATA_RELEASE_1_3_0

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Revision: 8028
          http://sourceforge.net/p/bigdata/code/8028
Author:   mrpersonick
Date:     2014-04-02 16:13:03 +0000 (Wed, 02 Apr 2014)
Log Message:
-----------
fixing ticket 872 - added a magic predicate to full text search for range count

Modified Paths:
--------------
    branches/BIGDATA_RELEASE_1_3_0/bigdata/src/java/com/bigdata/search/FullTextIndex.java
    branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/sparql/ast/eval/ASTSearchOptimizer.java
    branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/sparql/ast/eval/SearchServiceFactory.java
    branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/store/BDS.java

Modified: branches/BIGDATA_RELEASE_1_3_0/bigdata/src/java/com/bigdata/search/FullTextIndex.java
===================================================================

--- branches/BIGDATA_RELEASE_1_3_0/bigdata/src/java/com/bigdata/search/FullTextIndex.java	2014-04-02 13:14:09 UTC (rev 8027)
+++ branches/BIGDATA_RELEASE_1_3_0/bigdata/src/java/com/bigdata/search/FullTextIndex.java	2014-04-02 16:13:03 UTC (rev 8028)
@@ -955,35 +955,137 @@
 
     }
     
+    /**
+     * Perform a range count on a full text query.
+     */
     public int count(final FullTextQuery query) {
     	
-		final Hit[] a = _search(query);
+    	if (cache.containsKey(query)) {
+    		
+        	if (log.isInfoEnabled())
+        		log.info("found hits in cache");
+        	
+    		return cache.get(query).length;
+    		
+        } else {
+        	
+        	if (log.isInfoEnabled())
+        		log.info("did not find hits in cache");
+        	
+        }
+    	
+        // tokenize the query.
+        final TermFrequencyData<V> qdata = tokenize(query);
+        
+        // No terms after stopword extraction
+        if (qdata == null) {
+        	
+        	cache.put(query, new Hit[] {});
+        	
+        	return 0;
+        			
+        }
+        
+        /*
+         * We can run an optimized version of this (just a quick range count)
+         * but only if the caller does not care about exact match and has
+         * not specified a regex.
+         */
+        if (qdata.distinctTermCount() == 1 &&
+        		!query.isMatchExact() && query.getMatchRegex() == null) {
+        	
+        	final boolean prefixMatch = query.isPrefixMatch();
+        	
+        	final Map.Entry<String, ITermMetadata> e = qdata.getSingletonEntry();
+        	
+            final String termText = e.getKey();
+        	
+            final ITermMetadata md = e.getValue();
+
+            final CountIndexTask<V> task1 = new CountIndexTask<V>(termText, 0, 1, 
+            		prefixMatch, md.getLocalTermWeight(), this);
+            
+            return (int) task1.getRangeCount();
+        	
+        } else {
+	        
+			final Hit<V>[] a = _search(query);
+	    	
+			return a.length;
+			
+        }
 		
-		return a.length;
-		
     }
     
-    public Hit<V>[] _search(final FullTextQuery q) {
+    protected TermFrequencyData<V> tokenize(final FullTextQuery query) {
     	
-		final String query = q.getQuery();
-		final String languageCode = q.getLanguageCode(); 
-		final boolean prefixMatch = q.isPrefixMatch();
-        final double minCosine = q.getMinCosine();
-        final double maxCosine = q.getMaxCosine();
-        final int minRank = q.getMinRank();
-        final int maxRank = q.getMaxRank(); 
-        final boolean matchAllTerms = q.isMatchAllTerms();
-        final boolean matchExact = q.isMatchExact();
-        final String regex = q.getMatchRegex();
-        long timeout = q.getTimeout();
-        final TimeUnit unit = q.getTimeUnit(); 
+		final String q = query.getQuery();
+		final String languageCode = query.getLanguageCode(); 
+		final boolean prefixMatch = query.isPrefixMatch();
 
+        // tokenize the query.
+        final TermFrequencyData<V> qdata;
+        {
+            
+    		final TokenBuffer<V> buffer = new TokenBuffer<V>(1, this);
+
+            /*
+             * If we are using prefix match ('*' operator) then we don't want to
+             * filter stopwords from the search query.
+             */
+            final boolean filterStopwords = !prefixMatch;
+
+            index(buffer, //
+                    null, // docId // was Long.MIN_VALUE
+                    Integer.MIN_VALUE, // fieldId
+                    languageCode,//
+                    new StringReader(q), //
+                    filterStopwords//
+            );
+
+            if (buffer.size() == 0) {
+
+                /*
+                 * There were no terms after stopword extraction.
+                 */
+
+                log.warn("No terms after stopword extraction: query=" + query);
+
+                return null; 
+
+            }
+            
+            qdata = buffer.get(0);
+            
+            qdata.normalize();
+            
+        }
+        
+        return qdata;
+
+    }
+    
+    public Hit<V>[] _search(final FullTextQuery query) {
+    	
+		final String queryStr = query.getQuery();
+		final String languageCode = query.getLanguageCode(); 
+		final boolean prefixMatch = query.isPrefixMatch();
+        final double minCosine = query.getMinCosine();
+        final double maxCosine = query.getMaxCosine();
+        final int minRank = query.getMinRank();
+        final int maxRank = query.getMaxRank(); 
+        final boolean matchAllTerms = query.isMatchAllTerms();
+        final boolean matchExact = query.isMatchExact();
+        final String regex = query.getMatchRegex();
+        long timeout = query.getTimeout();
+        final TimeUnit unit = query.getTimeUnit(); 
+
         final long begin = System.currentTimeMillis();
         
 //        if (languageCode == null)
 //            throw new IllegalArgumentException();
 
-        if (query == null)
+        if (queryStr == null)
             throw new IllegalArgumentException();
         
         if (minCosine < 0d || minCosine > 1d)
@@ -1002,7 +1104,7 @@
             throw new IllegalArgumentException();
         
         if (log.isInfoEnabled())
-            log.info("languageCode=[" + languageCode + "], text=[" + query
+            log.info("languageCode=[" + languageCode + "], text=[" + queryStr
                     + "], minCosine=" + minCosine 
                     + ", maxCosine=" + maxCosine
                     + ", minRank=" + minRank
@@ -1018,7 +1120,7 @@
             
         }
         
-        final FullTextQuery cacheKey = q;
+        final FullTextQuery cacheKey = query;
         
         Hit<V>[] a;
         
@@ -1034,145 +1136,24 @@
         	if (log.isInfoEnabled())
         		log.info("did not find hits in cache");
         	
-	        // tokenize the query.
-	        final TermFrequencyData<V> qdata;
-	        {
-	            
-	            final TokenBuffer<V> buffer = new TokenBuffer<V>(1, this);
-	
-	            /*
-	             * If we are using prefix match ('*' operator) then we don't want to
-	             * filter stopwords from the search query.
-	             */
-	            final boolean filterStopwords = !prefixMatch;
-	
-	            index(buffer, //
-	                    null, // docId // was Long.MIN_VALUE
-	                    Integer.MIN_VALUE, // fieldId
-	                    languageCode,//
-	                    new StringReader(query), //
-	                    filterStopwords//
-	            );
-	
-	            if (buffer.size() == 0) {
-	
-	                /*
-	                 * There were no terms after stopword extration.
-	                 */
-	
-	                log.warn("No terms after stopword extraction: query=" + query);
-	
-	                a = new Hit[] {};
-	                
-	                cache.put(cacheKey, a);
-	                
-	                return a; 
-	
-	            }
-	            
-	            qdata = buffer.get(0);
-	            
-	            qdata.normalize();
-	            
-	        }
-	
-	        final IHitCollector<V> hits;
-	        
-	        if (qdata.distinctTermCount() == 1) {
-	        	
-	        	final Map.Entry<String, ITermMetadata> e = qdata.getSingletonEntry();
-	        	
-                final String termText = e.getKey();
+            // tokenize the query.
+            final TermFrequencyData<V> qdata = tokenize(query);
+            
+            // No terms after stopword extraction
+            if (qdata == null) {
             	
-                final ITermMetadata md = e.getValue();
-
-                final CountIndexTask<V> task1 = new CountIndexTask<V>(termText, 0, 1, prefixMatch, md
-                        .getLocalTermWeight(), this);
-                
-                hits = new SingleTokenHitCollector<V>(task1);
-	        	
-	        } else {
-	        	
-	            final List<CountIndexTask<V>> tasks = new ArrayList<CountIndexTask<V>>(
-	                    qdata.distinctTermCount());
-	
-	            int i = 0;
-	            for (Map.Entry<String, ITermMetadata> e : qdata.terms.entrySet()) {
-	
-	                final String termText = e.getKey();
-	
-	                final ITermMetadata md = e.getValue();
-	
-	                tasks.add(new CountIndexTask<V>(termText, i++, qdata.terms.size(), prefixMatch, md
-	                        .getLocalTermWeight(), this));
-	
-	            }
-	            
-	            hits = new MultiTokenHitCollector<V>(tasks);
-	        	
-	        }
-	        
-	        // run the queries.
-	        {
-	
-	            final List<Callable<Object>> tasks = new ArrayList<Callable<Object>>(
-	                    qdata.distinctTermCount());
-	
-	            int i = 0;
-	            for (Map.Entry<String, ITermMetadata> e : qdata.terms.entrySet()) {
-	
-	                final String termText = e.getKey();
-	
-	                final ITermMetadata md = e.getValue();
-	
-	                tasks.add(new ReadIndexTask<V>(termText, i++, qdata.terms.size(),
-	                		prefixMatch, md.getLocalTermWeight(), this, hits));
-	
-	            }
-	
-	            final ExecutionHelper<Object> executionHelper = new ExecutionHelper<Object>(
-	                    getExecutorService(), timeout, unit);
-	
-	            try {
-	
-	    	        final long start = System.currentTimeMillis();
-	    	        
-	                executionHelper.submitTasks(tasks);
-	                
-	                if (log.isInfoEnabled()) {
-		                final long readTime = System.currentTimeMillis() - start;
-		                log.info("read time: " + readTime);
-	                }
-	                
-	            } catch (InterruptedException ex) {
-	
-	            	if (log.isInfoEnabled()) {
-		                // TODO Should we wrap and toss this interrupt instead?
-		                log.info("Interrupted - only partial results will be returned.");
-	            	}
-	                
-	            	/*
-	            	 * Yes, let's toss it.  We were getting into a situation
-	            	 * where the ExecutionHelper above received an interrupt
-	            	 * but we still went through the heavy-weight filtering
-	            	 * operations below (matchExact or matchRegex).
-	            	 */
-	            	throw new RuntimeException(ex);
-
-	            } catch (ExecutionException ex) {
-	
-	                throw new RuntimeException(ex);
-	                
-	            }
-	
-	        }
-	        
-	        a = hits.getHits();
-	        
+            	cache.put(cacheKey, a = new Hit[] {});
+            	
+            	return a;
+            			
+            }
+            
+            a = executeQuery(qdata, prefixMatch, timeout, unit);
+            
 	        if (a.length == 0) {
 	        	
 	            log.info("No hits: languageCode=[" + languageCode + "], query=["
-	                    + query + "]");
+	                    + queryStr + "]");
 	            
 	            cache.put(cacheKey, a);
 	            
@@ -1223,14 +1204,14 @@
 	         */
 	        if (matchExact) {
 	        	
-	        	a = matchExact(a, query);
+	        	a = matchExact(a, queryStr);
 	        	
 	        }
 	        
 	        if (a.length == 0) {
 	        	
 	            log.warn("No hits after matchAllTerms pruning: languageCode=[" + languageCode + "], query=["
-	                    + query + "]");
+	                    + queryStr + "]");
 	            
 	            cache.put(cacheKey, a);
 	            
@@ -1260,7 +1241,7 @@
 	        if (a.length == 0) {
 	        	
 	            log.warn("No hits after regex pruning: languageCode=[" + languageCode + "], query=["
-	                    + query + "], regex=[" + regex + "]");
+	                    + queryStr + "], regex=[" + regex + "]");
 	            
 	            cache.put(cacheKey, a);
 	            
@@ -1299,6 +1280,27 @@
         
         }
         
+        /*
+         * Take a slice of the hits based on min/max cosine and min/max rank.
+         */
+        a = slice(query, a);
+        
+        final long elapsed = System.currentTimeMillis() - begin;
+        
+        if (log.isInfoEnabled())
+            log.info("Done: " + a.length + " hits in " + elapsed + "ms");
+
+        return a;
+        
+    }
+    
+    protected Hit<V>[] slice(final FullTextQuery query, Hit<V>[] a) {
+    	
+        final double minCosine = query.getMinCosine();
+        final double maxCosine = query.getMaxCosine();
+        final int minRank = query.getMinRank();
+        final int maxRank = query.getMaxRank();
+        
 //        if (log.isDebugEnabled()) {
 //        	log.debug("before min/max cosine/rank pruning:");
 //        	for (Hit<V> h : a)
@@ -1422,13 +1424,106 @@
         	
         }
 
-        final long elapsed = System.currentTimeMillis() - begin;
+        return a;
         
-        if (log.isInfoEnabled())
-            log.info("Done: " + a.length + " hits in " + elapsed + "ms");
+    }
+    
+    protected Hit<V>[] executeQuery(final TermFrequencyData<V> qdata,
+    		final boolean prefixMatch, final long timeout, final TimeUnit unit) {
+    	
+        final IHitCollector<V> hits;
+        
+        if (qdata.distinctTermCount() == 1) {
+        	
+        	final Map.Entry<String, ITermMetadata> e = qdata.getSingletonEntry();
+        	
+            final String termText = e.getKey();
+        	
+            final ITermMetadata md = e.getValue();
 
-        return a;
+            final CountIndexTask<V> task1 = new CountIndexTask<V>(termText, 0, 1,
+            		prefixMatch, md.getLocalTermWeight(), this);
+            
+            hits = new SingleTokenHitCollector<V>(task1);
+        	
+        } else {
+        	
+            final List<CountIndexTask<V>> tasks = new ArrayList<CountIndexTask<V>>(
+                    qdata.distinctTermCount());
+
+            int i = 0;
+            for (Map.Entry<String, ITermMetadata> e : qdata.terms.entrySet()) {
+
+                final String termText = e.getKey();
+
+                final ITermMetadata md = e.getValue();
+
+                tasks.add(new CountIndexTask<V>(termText, i++, qdata.terms.size(), 
+                		prefixMatch, md.getLocalTermWeight(), this));
+
+            }
+            
+            hits = new MultiTokenHitCollector<V>(tasks);
+        	
+        }
         
+        // run the queries.
+        {
+
+            final List<Callable<Object>> tasks = new ArrayList<Callable<Object>>(
+                    qdata.distinctTermCount());
+
+            int i = 0;
+            for (Map.Entry<String, ITermMetadata> e : qdata.terms.entrySet()) {
+
+                final String termText = e.getKey();
+
+                final ITermMetadata md = e.getValue();
+
+                tasks.add(new ReadIndexTask<V>(termText, i++, qdata.terms.size(),
+                		prefixMatch, md.getLocalTermWeight(), this, hits));
+
+            }
+
+            final ExecutionHelper<Object> executionHelper = new ExecutionHelper<Object>(
+                    getExecutorService(), timeout, unit);
+
+            try {
+
+    	        final long start = System.currentTimeMillis();
+    	        
+                executionHelper.submitTasks(tasks);
+                
+                if (log.isInfoEnabled()) {
+	                final long readTime = System.currentTimeMillis() - start;
+	                log.info("read time: " + readTime);
+                }
+                
+            } catch (InterruptedException ex) {
+
+            	if (log.isInfoEnabled()) {
+	                // TODO Should we wrap and toss this interrupt instead?
+	                log.info("Interrupted - only partial results will be returned.");
+            	}
+                
+            	/*
+            	 * Yes, let's toss it.  We were getting into a situation
+            	 * where the ExecutionHelper above received an interrupt
+            	 * but we still went through the heavy-weight filtering
+            	 * operations below (matchExact or matchRegex).
+            	 */
+            	throw new RuntimeException(ex);
+
+            } catch (ExecutionException ex) {
+
+                throw new RuntimeException(ex);
+                
+            }
+
+        }
+        
+        return hits.getHits();
+
     }
     
     /**

Modified: branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/sparql/ast/eval/ASTSearchOptimizer.java
===================================================================
--- branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/sparql/ast/eval/ASTSearchOptimizer.java	2014-04-02 13:14:09 UTC (rev 8027)
+++ branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/sparql/ast/eval/ASTSearchOptimizer.java	2014-04-02 16:13:03 UTC (rev 8028)
@@ -108,6 +108,7 @@
         set.add(BDS.SUBJECT_SEARCH);
         set.add(BDS.SEARCH_TIMEOUT);
         set.add(BDS.MATCH_REGEX);
+        set.add(BDS.RANGE_COUNT);
         
         searchUris = Collections.unmodifiableSet(set);
         

Modified: branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/sparql/ast/eval/SearchServiceFactory.java
===================================================================
--- branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/sparql/ast/eval/SearchServiceFactory.java	2014-04-02 13:14:09 UTC (rev 8027)
+++ branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/sparql/ast/eval/SearchServiceFactory.java	2014-04-02 16:13:03 UTC (rev 8028)
@@ -69,6 +69,7 @@
 import com.bigdata.rdf.store.BDS;
 import com.bigdata.search.Hiterator;
 import com.bigdata.search.IHit;
+import com.bigdata.striterator.ChunkedArrayIterator;
 
 import cutthecrap.utils.striterators.ICloseableIterator;
 
@@ -300,6 +301,10 @@
                 
                 assertObjectIsLiteral(sp);
                 
+            } else if (uri.equals(BDS.RANGE_COUNT)) {
+                
+                assertObjectIsVariable(sp);
+                
             } else if(uri.equals(BDS.MATCH_REGEX)) {
                 
             	// a variable for the object is equivalent to regex = null
@@ -367,6 +372,7 @@
         private final boolean subjectSearch;
         private final Literal searchTimeout;
         private final Literal matchRegex;
+        private final IVariable<?> rangeCountVar;
         
         public SearchCall(
                 final AbstractTripleStore store,
@@ -415,6 +421,7 @@
 
             IVariable<?> relVar = null;
             IVariable<?> rankVar = null;
+            IVariable<?> rangeCountVar = null;
             Literal minRank = null;
             Literal maxRank = null;
             Literal minRelevance = null;
@@ -439,6 +446,8 @@
                     relVar = oVar;
                 } else if (BDS.RANK.equals(p)) {
                     rankVar = oVar;
+                } else if (BDS.RANGE_COUNT.equals(p)) {
+                    rangeCountVar = oVar;
                 } else if (BDS.MIN_RANK.equals(p)) {
                     minRank = (Literal) oVal;
                 } else if (BDS.MAX_RANK.equals(p)) {
@@ -484,6 +493,7 @@
             this.subjectSearch = subjectSearch;
             this.searchTimeout = searchTimeout;
             this.matchRegex = matchRegex;
+            this.rangeCountVar = rangeCountVar;
 
         }
 
@@ -527,6 +537,46 @@
         
         }
 
+        @SuppressWarnings({ "rawtypes", "unchecked" })
+        private int getRangeCount() {
+
+//            final IValueCentricTextIndexer<IHit> textIndex = (IValueCentricTextIndexer) store
+//                    .getLexiconRelation().getSearchEngine();
+            
+        	final ITextIndexer<IHit> textIndex = (ITextIndexer) 
+	    		(this.subjectSearch ?
+	    			store.getLexiconRelation().getSubjectCentricSearchEngine() :
+	    				store.getLexiconRelation().getSearchEngine());
+        	
+            if (textIndex == null)
+                throw new UnsupportedOperationException("No free text index?");
+
+            String s = query.getLabel();
+            final boolean prefixMatch;
+            if (s.indexOf('*') >= 0) {
+                prefixMatch = true;
+                s = s.replaceAll("\\*", "");
+            } else {
+                prefixMatch = false;
+            }
+
+            return textIndex.count(new FullTextQuery(
+        		s,//
+                query.getLanguage(),// 
+                prefixMatch,//
+                matchRegex == null ? null : matchRegex.stringValue(),
+                matchAllTerms,
+                matchExact,
+                minRelevance == null ? BDS.DEFAULT_MIN_RELEVANCE : minRelevance.doubleValue()/* minCosine */, 
+                maxRelevance == null ? BDS.DEFAULT_MAX_RELEVANCE : maxRelevance.doubleValue()/* maxCosine */, 
+                minRank == null ? BDS.DEFAULT_MIN_RANK/*1*/ : minRank.intValue()/* minRank */,
+                maxRank == null ? BDS.DEFAULT_MAX_RANK/*Integer.MAX_VALUE*/ : maxRank.intValue()/* maxRank */,
+                searchTimeout == null ? BDS.DEFAULT_TIMEOUT/*0L*/ : searchTimeout.longValue()/* timeout */,
+                TimeUnit.MILLISECONDS
+                ));
+        
+        }
+
         /**
          * {@inheritDoc}
          * 
@@ -561,7 +611,24 @@
 
             }
             
-            return new HitConverter(getHiterator());
+            if (rangeCountVar != null) {
+            
+            	final int i = getRangeCount();
+            	
+            	@SuppressWarnings({ "rawtypes", "unchecked" })
+				final ListBindingSet bs = new ListBindingSet(
+            			new IVariable[] { rangeCountVar },
+            			new IConstant[] { new Constant(new XSDNumericIV(i)) });
+            	
+            	return new ChunkedArrayIterator<IBindingSet>(new IBindingSet[] {
+            		bs
+            	});
+            	
+            } else {
+            
+            	return new HitConverter(getHiterator());
+            	
+            }
 
         }
             
@@ -631,11 +698,11 @@
             
                 final ListBindingSet bs = new ListBindingSet(vars, vals);
                 
-                if (log.isInfoEnabled()) {
-                	log.info(bs);
-                	log.info(query.getClass());
-                	log.info(((BigdataLiteral) query).getIV());
-                	log.info(((BigdataLiteral) query).getIV().getClass());
+                if (log.isTraceEnabled()) {
+                	log.trace(bs);
+                	log.trace(query.getClass());
+                	log.trace(((BigdataLiteral) query).getIV());
+                	log.trace(((BigdataLiteral) query).getIV().getClass());
                 }
                 
                 return bs;

Modified: branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/store/BDS.java
===================================================================
--- branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/store/BDS.java	2014-04-02 13:14:09 UTC (rev 8027)
+++ branches/BIGDATA_RELEASE_1_3_0/bigdata-rdf/src/java/com/bigdata/rdf/store/BDS.java	2014-04-02 16:13:03 UTC (rev 8028)
@@ -420,5 +420,14 @@
      * The default timeout for a free text search (milliseconds).
      */
     final long DEFAULT_TIMEOUT = Long.MAX_VALUE;
+    
+    /**
+     * Magic predicate to specify that we want a range count done on the search.
+     * Bind the range count to the variable in the object position.  Will
+     * attempt to do a fast range count on the index rather than materializing
+     * the hits into an array.  This is only possible if matchExact == false
+     * and matchRegex == null.
+     */
+    final URI RANGE_COUNT = new URIImpl(NAMESPACE + "rangeCount");
 
 }

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.





[Bigdata-commit] SF.net SVN: bigdata:[8028] branches/BIGDATA_RELEASE_1_3_0

Fast, scalable, robust graph database platform

[Bigdata-commit] SF.net SVN: bigdata:[8028] branches/BIGDATA_RELEASE_1_3_0