From: <mrp...@us...> - 2011-02-20 23:39:34
|
Revision: 4212 http://bigdata.svn.sourceforge.net/bigdata/?rev=4212&view=rev Author: mrpersonick Date: 2011-02-20 23:39:28 +0000 (Sun, 20 Feb 2011) Log Message: ----------- turn off stopword filter when using prefix match Modified Paths: -------------- branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/search/FullTextIndex.java branches/QUADS_QUERY_BRANCH/bigdata-sails/src/test/com/bigdata/rdf/sail/TestSearchQuery.java Modified: branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/search/FullTextIndex.java =================================================================== --- branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/search/FullTextIndex.java 2011-02-20 21:20:44 UTC (rev 4211) +++ branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/search/FullTextIndex.java 2011-02-20 23:39:28 UTC (rev 4212) @@ -51,7 +51,10 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.standard.StandardFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.tokenattributes.TermAttribute; +import org.apache.lucene.util.Version; import com.bigdata.bop.IBindingSet; import com.bigdata.bop.IPredicate; @@ -661,6 +664,19 @@ } /** + * See {@link #index(TokenBuffer, long, int, String, Reader, boolean)}. + * <p> + * Uses a default filterStopwords value of true. + * + */ + public void index(final TokenBuffer buffer, final long docId, final int fieldId, + final String languageCode, final Reader r) { + + index(buffer, docId, fieldId, languageCode, r, true/* filterStopwords */); + + } + + /** * Index a field in a document. * <p> * Note: This method does NOT force a write on the indices. If the <i>buffer</i> @@ -684,11 +700,13 @@ * {@link Locale}. * @param r * A reader on the text to be indexed. 
+ * @param filterStopwords + * if true, filter stopwords from the token stream * * @see TokenBuffer#flush() */ public void index(final TokenBuffer buffer, final long docId, final int fieldId, - final String languageCode, final Reader r) { + final String languageCode, final Reader r, final boolean filterStopwords) { /* * Note: You can invoke this on a read-only index. It is only overflow @@ -701,7 +719,7 @@ int n = 0; // tokenize (note: docId,fieldId are not on the tokenStream, but the field could be). - final TokenStream tokenStream = getTokenStream(languageCode, r); + final TokenStream tokenStream = getTokenStream(languageCode, r, filterStopwords); try { while (tokenStream.incrementToken()) { TermAttribute term=tokenStream.getAttribute(TermAttribute.class); @@ -729,10 +747,14 @@ * * @param r * A reader on the text to be indexed. + * + * @param filterStopwords + * if true, filter stopwords from the token stream * * @return The extracted token stream. */ - protected TokenStream getTokenStream(final String languageCode, final Reader r) { + protected TokenStream getTokenStream(final String languageCode, + final Reader r, final boolean filterStopwords) { /* * Note: This is stripping out stopwords by default. @@ -741,9 +763,22 @@ */ final Analyzer a = getAnalyzer(languageCode); - TokenStream tokenStream = a.tokenStream(null/* @todo field? */, r); + TokenStream tokenStream; + if (filterStopwords) { + tokenStream = a.tokenStream(null/* @todo field? */, r); + } else { + /* + * To eliminate stopword filtering, we simulate the tokenStream() + * operation above per the javadoc for that method, which says: + * "Constructs a StandardTokenizer filtered by a StandardFilter, + * a LowerCaseFilter and a StopFilter", eliminating the StopFilter. + */ + tokenStream = new StandardTokenizer(Version.LUCENE_CURRENT, r); + tokenStream = new StandardFilter(tokenStream); + } // force to lower case. + // might be able to move this inside the else {} block above? 
tokenStream = new LowerCaseFilter(tokenStream); return tokenStream; @@ -1037,9 +1072,15 @@ final TokenBuffer buffer = new TokenBuffer(1, this); + /* + * If we are using prefix match (* operator) then we don't want + * to filter stopwords from the search query. + */ + final boolean filterStopwords = !prefixMatch; + index(buffer, Long.MIN_VALUE/* docId */, Integer.MIN_VALUE/* fieldId */, languageCode, - new StringReader(query)); + new StringReader(query), filterStopwords); if (buffer.size() == 0) { Modified: branches/QUADS_QUERY_BRANCH/bigdata-sails/src/test/com/bigdata/rdf/sail/TestSearchQuery.java =================================================================== --- branches/QUADS_QUERY_BRANCH/bigdata-sails/src/test/com/bigdata/rdf/sail/TestSearchQuery.java 2011-02-20 21:20:44 UTC (rev 4211) +++ branches/QUADS_QUERY_BRANCH/bigdata-sails/src/test/com/bigdata/rdf/sail/TestSearchQuery.java 2011-02-20 23:39:28 UTC (rev 4212) @@ -40,6 +40,7 @@ import java.util.Set; import java.util.concurrent.TimeUnit; +import org.apache.log4j.Logger; import org.openrdf.model.BNode; import org.openrdf.model.Graph; import org.openrdf.model.Literal; @@ -99,6 +100,8 @@ */ public class TestSearchQuery extends ProxyBigdataSailTestCase { + protected static final Logger log = Logger.getLogger(TestSearchQuery.class); + public TestSearchQuery() { } @@ -708,6 +711,7 @@ final URI s5 = vf.createURI(BD.NAMESPACE+"s5"); final URI s6 = vf.createURI(BD.NAMESPACE+"s6"); final URI s7 = vf.createURI(BD.NAMESPACE+"s7"); + final URI s8 = vf.createURI(BD.NAMESPACE+"s8"); final Literal l1 = vf.createLiteral("how"); final Literal l2 = vf.createLiteral("now"); final Literal l3 = vf.createLiteral("brown"); @@ -715,6 +719,7 @@ final Literal l5 = vf.createLiteral("how now"); final Literal l6 = vf.createLiteral("brown cow"); final Literal l7 = vf.createLiteral("how now brown cow"); + final Literal l8 = vf.createLiteral("toilet"); cxn.add(s1, RDFS.LABEL, l1); cxn.add(s2, RDFS.LABEL, l2); @@ -723,6 +728,7 @@ 
cxn.add(s5, RDFS.LABEL, l5); cxn.add(s6, RDFS.LABEL, l6); cxn.add(s7, RDFS.LABEL, l7); + cxn.add(s8, RDFS.LABEL, l8); /* * Note: Either flush() or commit() is required to flush the @@ -739,6 +745,7 @@ literals.put(((BigdataValue)l5).getIV(), l5); literals.put(((BigdataValue)l6).getIV(), l6); literals.put(((BigdataValue)l7).getIV(), l7); + literals.put(((BigdataValue)l8).getIV(), l8); final Map<IV, URI> uris = new LinkedHashMap<IV, URI>(); uris.put(((BigdataValue)l1).getIV(), s1); @@ -748,6 +755,7 @@ uris.put(((BigdataValue)l5).getIV(), s5); uris.put(((BigdataValue)l6).getIV(), s6); uris.put(((BigdataValue)l7).getIV(), s7); + uris.put(((BigdataValue)l8).getIV(), s8); /**/ if (log.isInfoEnabled()) { @@ -1066,6 +1074,71 @@ } + { // prefix match using a stopword + + final String searchQuery = "to*"; + final double minRelevance = 0.0d; + + final String query = + "select ?s ?o ?score " + + "where " + + "{ " + + " ?s <"+RDFS.LABEL+"> ?o . " + + " ?o <"+BD.SEARCH+"> \""+searchQuery+"\" . " + + " ?o <"+BD.RELEVANCE+"> ?score . " + +// " ?o <"+BD.MIN_RELEVANCE+"> \""+minRelevance+"\" . " + +// " ?o <"+BD.MAX_HITS+"> \"5\" . 
" + +// " filter regex(?o, \""+searchQuery+"\") " + + "} " + + "order by desc(?score)"; + + log.info("\n"+query); + + final TupleQuery tupleQuery = + cxn.prepareTupleQuery(QueryLanguage.SPARQL, query); + tupleQuery.setIncludeInferred(true /* includeInferred */); + TupleQueryResult result = tupleQuery.evaluate(); + + int i = 0; + while (result.hasNext()) { + log.info(i++ + ": " + result.next().toString()); + } + assertTrue("wrong # of results: " + i, i == 1); + + result = tupleQuery.evaluate(); + + Collection<BindingSet> answer = new LinkedList<BindingSet>(); + + final ITextIndexer search = + sail.getDatabase().getLexiconRelation().getSearchEngine(); + final Hiterator<IHit> hits = + search.search(searchQuery, + null, // languageCode + true, // prefixMatch + minRelevance, // minCosine + 10000, // maxRank (=maxResults + 1) + 1000L, // timeout + TimeUnit.MILLISECONDS // unit + ); + + while (hits.hasNext()) { + final IHit hit = hits.next(); + final IV id = new TermId(VTE.LITERAL, hit.getDocId()); + final Literal score = vf.createLiteral(hit.getCosine()); + final URI s = uris.get(id); + final Literal o = literals.get(id); + final BindingSet bs = createBindingSet( + new BindingImpl("s", s), + new BindingImpl("o", o), + new BindingImpl("score", score)); + log.info(bs); + answer.add(bs); + } + + compare(result, answer); + + } + } finally { cxn.close(); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |