[Bigdata-commit] SF.net SVN: bigdata:[4212] branches/QUADS_QUERY_BRANCH/bigdata

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 4212
          http://bigdata.svn.sourceforge.net/bigdata/?rev=4212&view=rev
Author:   mrpersonick
Date:     2011-02-20 23:39:28 +0000 (Sun, 20 Feb 2011)

Log Message:
-----------
turn off stopword filter when using prefix match

Modified Paths:
--------------
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/search/FullTextIndex.java
    branches/QUADS_QUERY_BRANCH/bigdata-sails/src/test/com/bigdata/rdf/sail/TestSearchQuery.java

Modified: branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/search/FullTextIndex.java
===================================================================

--- branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/search/FullTextIndex.java	2011-02-20 21:20:44 UTC (rev 4211)
+++ branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/search/FullTextIndex.java	2011-02-20 23:39:28 UTC (rev 4212)
@@ -51,7 +51,10 @@
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.Version;
 
 import com.bigdata.bop.IBindingSet;
 import com.bigdata.bop.IPredicate;
@@ -661,6 +664,19 @@
     }
 
     /**
+     * See {@link #index(TokenBuffer, long, int, String, Reader, boolean)}.
+     * <p>
+     * Uses a default filterStopwords value of true.
+     * 
+     */
+    public void index(final TokenBuffer buffer, final long docId, final int fieldId,
+            final String languageCode, final Reader r) {
+    	
+    	index(buffer, docId, fieldId, languageCode, r, true/* filterStopwords */);
+    	
+    }
+    
+    /**
      * Index a field in a document.
      * <p>
      * Note: This method does NOT force a write on the indices. If the <i>buffer</i>
@@ -684,11 +700,13 @@
      *            {@link Locale}.
      * @param r
      *            A reader on the text to be indexed.
+     * @param filterStopwords
+     * 			  if true, filter stopwords from the token stream            
      * 
      * @see TokenBuffer#flush()
      */
     public void index(final TokenBuffer buffer, final long docId, final int fieldId,
-            final String languageCode, final Reader r) {
+            final String languageCode, final Reader r, final boolean filterStopwords) {
 
         /*
          * Note: You can invoke this on a read-only index. It is only overflow
@@ -701,7 +719,7 @@
         int n = 0;
         
         // tokenize (note: docId,fieldId are not on the tokenStream, but the field could be).
-        final TokenStream tokenStream = getTokenStream(languageCode, r);
+        final TokenStream tokenStream = getTokenStream(languageCode, r, filterStopwords);
         try {
         while (tokenStream.incrementToken()) {
             TermAttribute term=tokenStream.getAttribute(TermAttribute.class);
@@ -729,10 +747,14 @@
      * 
      * @param r
      *            A reader on the text to be indexed.
+     *            
+     * @param filterStopwords
+     * 			  if true, filter stopwords from the token stream            
      * 
      * @return The extracted token stream.
      */
-    protected TokenStream getTokenStream(final String languageCode, final Reader r) {
+    protected TokenStream getTokenStream(final String languageCode, 
+    		final Reader r, final boolean filterStopwords) {
 
         /*
          * Note: This stripping out stopwords by default.
@@ -741,9 +763,22 @@
          */
         final Analyzer a = getAnalyzer(languageCode);
         
-        TokenStream tokenStream = a.tokenStream(null/* @todo field? */, r);
+        TokenStream tokenStream;
+        if (filterStopwords) {
+        	tokenStream = a.tokenStream(null/* @todo field? */, r);
+        } else {
+        	/*
+        	 * To eliminate stopword filtering, we simulate the tokenStream()
+        	 * operation above per the javadoc for that method, which says:
+        	 * "Constructs a StandardTokenizer filtered by a StandardFilter, 
+        	 * a LowerCaseFilter and a StopFilter", eliminating the StopFilter.
+        	 */
+        	tokenStream = new StandardTokenizer(Version.LUCENE_CURRENT, r);
+        	tokenStream = new StandardFilter(tokenStream);
+        }
         
         // force to lower case.
+        // might be able to move this inside the else {} block above?
         tokenStream = new LowerCaseFilter(tokenStream);
         
         return tokenStream;
@@ -1037,9 +1072,15 @@
             
             final TokenBuffer buffer = new TokenBuffer(1, this);
             
+            /*
+             * If we are using prefix match (* operator) then we don't want
+             * to filter stopwords from the search query.
+             */
+            final boolean filterStopwords = !prefixMatch;
+            
             index(buffer, Long.MIN_VALUE/* docId */,
                     Integer.MIN_VALUE/* fieldId */, languageCode,
-                    new StringReader(query));
+                    new StringReader(query), filterStopwords);
 
             if (buffer.size() == 0) {
 

Modified: branches/QUADS_QUERY_BRANCH/bigdata-sails/src/test/com/bigdata/rdf/sail/TestSearchQuery.java
===================================================================
--- branches/QUADS_QUERY_BRANCH/bigdata-sails/src/test/com/bigdata/rdf/sail/TestSearchQuery.java	2011-02-20 21:20:44 UTC (rev 4211)
+++ branches/QUADS_QUERY_BRANCH/bigdata-sails/src/test/com/bigdata/rdf/sail/TestSearchQuery.java	2011-02-20 23:39:28 UTC (rev 4212)
@@ -40,6 +40,7 @@
 import java.util.Set;
 import java.util.concurrent.TimeUnit;
 
+import org.apache.log4j.Logger;
 import org.openrdf.model.BNode;
 import org.openrdf.model.Graph;
 import org.openrdf.model.Literal;
@@ -99,6 +100,8 @@
  */
 public class TestSearchQuery extends ProxyBigdataSailTestCase {
 
+	protected static final Logger log = Logger.getLogger(TestSearchQuery.class);
+	
     public TestSearchQuery() {
         
     }
@@ -708,6 +711,7 @@
         	final URI s5 = vf.createURI(BD.NAMESPACE+"s5");
         	final URI s6 = vf.createURI(BD.NAMESPACE+"s6");
         	final URI s7 = vf.createURI(BD.NAMESPACE+"s7");
+        	final URI s8 = vf.createURI(BD.NAMESPACE+"s8");
         	final Literal l1 = vf.createLiteral("how");
         	final Literal l2 = vf.createLiteral("now");
         	final Literal l3 = vf.createLiteral("brown");
@@ -715,6 +719,7 @@
         	final Literal l5 = vf.createLiteral("how now");
         	final Literal l6 = vf.createLiteral("brown cow");
         	final Literal l7 = vf.createLiteral("how now brown cow");
+        	final Literal l8 = vf.createLiteral("toilet");
         	
             cxn.add(s1, RDFS.LABEL, l1);
             cxn.add(s2, RDFS.LABEL, l2);
@@ -723,6 +728,7 @@
             cxn.add(s5, RDFS.LABEL, l5);
             cxn.add(s6, RDFS.LABEL, l6);
             cxn.add(s7, RDFS.LABEL, l7);
+            cxn.add(s8, RDFS.LABEL, l8);
             
             /*
              * Note: The either flush() or commit() is required to flush the
@@ -739,6 +745,7 @@
             literals.put(((BigdataValue)l5).getIV(), l5);
             literals.put(((BigdataValue)l6).getIV(), l6);
             literals.put(((BigdataValue)l7).getIV(), l7);
+            literals.put(((BigdataValue)l8).getIV(), l8);
             
             final Map<IV, URI> uris = new LinkedHashMap<IV, URI>();
             uris.put(((BigdataValue)l1).getIV(), s1);
@@ -748,6 +755,7 @@
             uris.put(((BigdataValue)l5).getIV(), s5);
             uris.put(((BigdataValue)l6).getIV(), s6);
             uris.put(((BigdataValue)l7).getIV(), s7);
+            uris.put(((BigdataValue)l8).getIV(), s8);
             
 /**/            
             if (log.isInfoEnabled()) {
@@ -1066,6 +1074,71 @@
 
             }
 
+            { // prefix match using a stopword
+            	
+            	final String searchQuery = "to*";
+            	final double minRelevance = 0.0d;
+            	
+                final String query = 
+                    "select ?s ?o ?score " + 
+                    "where " +
+                    "{ " +
+                    "    ?s <"+RDFS.LABEL+"> ?o . " +
+                    "    ?o <"+BD.SEARCH+"> \""+searchQuery+"\" . " +
+                    "    ?o <"+BD.RELEVANCE+"> ?score . " +
+//                    "    ?o <"+BD.MIN_RELEVANCE+"> \""+minRelevance+"\" . " +
+//                    "    ?o <"+BD.MAX_HITS+"> \"5\" . " +
+//                    "    filter regex(?o, \""+searchQuery+"\") " +
+                    "} " +
+                    "order by desc(?score)";
+                
+                log.info("\n"+query);
+                
+                final TupleQuery tupleQuery = 
+                    cxn.prepareTupleQuery(QueryLanguage.SPARQL, query);
+                tupleQuery.setIncludeInferred(true /* includeInferred */);
+                TupleQueryResult result = tupleQuery.evaluate();
+
+                int i = 0;
+                while (result.hasNext()) {
+                	log.info(i++ + ": " + result.next().toString());
+                }
+                assertTrue("wrong # of results: " + i, i == 1);
+                
+                result = tupleQuery.evaluate();
+
+                Collection<BindingSet> answer = new LinkedList<BindingSet>();
+                
+                final ITextIndexer search = 
+                	sail.getDatabase().getLexiconRelation().getSearchEngine();
+                final Hiterator<IHit> hits = 
+                	search.search(searchQuery, 
+                            null, // languageCode
+                            true, // prefixMatch
+                            minRelevance, // minCosine
+                            10000, // maxRank (=maxResults + 1)
+                            1000L, // timeout 
+                            TimeUnit.MILLISECONDS // unit
+                            );
+                
+                while (hits.hasNext()) {
+                	final IHit hit = hits.next();
+                	final IV id = new TermId(VTE.LITERAL, hit.getDocId());
+                	final Literal score = vf.createLiteral(hit.getCosine());
+                	final URI s = uris.get(id);
+                	final Literal o = literals.get(id);
+                    final BindingSet bs = createBindingSet(
+                    		new BindingImpl("s", s),
+                    		new BindingImpl("o", o),
+                    		new BindingImpl("score", score));
+                    log.info(bs);
+                    answer.add(bs);
+                }
+                
+                compare(result, answer);
+
+            }
+
         } finally {
             cxn.close();
         }


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.




[Bigdata-commit] SF.net SVN: bigdata:[4212] branches/QUADS_QUERY_BRANCH/bigdata

Fast, scalable, robust graph database platform

[Bigdata-commit] SF.net SVN: bigdata:[4212] branches/QUADS_QUERY_BRANCH/bigdata