From: <tho...@us...> - 2011-03-15 13:49:12
|
Revision: 4300 http://bigdata.svn.sourceforge.net/bigdata/?rev=4300&view=rev Author: thompsonbry Date: 2011-03-15 13:49:04 +0000 (Tue, 15 Mar 2011) Log Message: ----------- A little bit of clean up for the free text index search stuff. Modified Paths: -------------- branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/search/FullTextIndex.java branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/BigdataRDFFullTextIndex.java branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/ITextIndexer.java branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/LexiconRelation.java branches/QUADS_QUERY_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/BigdataEvaluationStrategyImpl3.java branches/QUADS_QUERY_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/FreeTextSearchExpander.java Modified: branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/search/FullTextIndex.java =================================================================== --- branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/search/FullTextIndex.java 2011-03-14 18:19:34 UTC (rev 4299) +++ branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/search/FullTextIndex.java 2011-03-15 13:49:04 UTC (rev 4300) @@ -28,8 +28,6 @@ package com.bigdata.search; -import info.aduna.i18n.languagetag.IanaLanguageTag; - import java.io.IOException; import java.io.Reader; import java.io.StringReader; @@ -51,10 +49,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.standard.StandardFilter; -import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.tokenattributes.TermAttribute; -import org.apache.lucene.util.Version; import com.bigdata.bop.IBindingSet; import com.bigdata.bop.IPredicate; @@ -1017,7 +1012,7 @@ * iterator that is sent to the data service such that the search * terms are visited only when they occur in the matching field(s). */ - public Hiterator search(final String query, final String languageCode, + public Hiterator<Hit> search(final String query, final String languageCode, final boolean prefixMatch, final double minCosine, final int maxRank, long timeout, final TimeUnit unit) { @@ -1048,7 +1043,7 @@ if (timeout == 0L) { - // treat ZERO as eqivalent to MAX_LONG. + // treat ZERO as equivalent to MAX_LONG. timeout = Long.MAX_VALUE; } @@ -1188,11 +1183,6 @@ throw new UnsupportedOperationException(); } -// @SuppressWarnings("unchecked") -// public IAccessPath getAccessPath(IPredicate predicate) { -// throw new UnsupportedOperationException(); -// } - public Set<String> getIndexNames() { throw new UnsupportedOperationException(); } Modified: branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/BigdataRDFFullTextIndex.java =================================================================== --- branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/BigdataRDFFullTextIndex.java 2011-03-14 18:19:34 UTC (rev 4299) +++ branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/BigdataRDFFullTextIndex.java 2011-03-15 13:49:04 UTC (rev 4300) @@ -37,8 +37,8 @@ import com.bigdata.rdf.internal.IV; import com.bigdata.rdf.model.BigdataValue; import com.bigdata.rdf.store.AbstractTripleStore; -import com.bigdata.rdf.store.IRawTripleStore; import com.bigdata.search.FullTextIndex; +import com.bigdata.search.Hit; import com.bigdata.search.TokenBuffer; /** @@ -48,18 +48,18 @@ * @version $Id$ */ public class BigdataRDFFullTextIndex extends FullTextIndex implements - ITextIndexer { + ITextIndexer<Hit> { - static public ITextIndexer getInstance(final IIndexManager indexManager, - final String namespace, final Long timestamp, - final Properties properties) { + static public BigdataRDFFullTextIndex getInstance( + final IIndexManager indexManager, final String namespace, + final Long timestamp, final Properties properties) { if (namespace == null) throw new IllegalArgumentException(); - + return new BigdataRDFFullTextIndex(indexManager, namespace, timestamp, properties); - + } /** @@ -79,8 +79,9 @@ * @param timestamp * @param properties */ - public BigdataRDFFullTextIndex(IIndexManager indexManager, - String namespace, Long timestamp, Properties properties) { + public BigdataRDFFullTextIndex(final IIndexManager indexManager, + final String namespace, final Long timestamp, + final Properties properties) { super(indexManager, namespace, timestamp, properties); @@ -163,10 +164,9 @@ * cost of re-indexing each time we see a term. */ - final IV termId = val.getIV(); + final IV<?,?> termId = val.getIV(); - assert termId != null; // the termId must have been - // assigned. + assert termId != null; // the termId must have been assigned. // don't bother text indexing inline values for now if (termId.isInline()) { Modified: branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/ITextIndexer.java =================================================================== --- branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/ITextIndexer.java 2011-03-14 18:19:34 UTC (rev 4299) +++ branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/ITextIndexer.java 2011-03-15 13:49:04 UTC (rev 4300) @@ -37,6 +37,7 @@ import com.bigdata.rdf.store.AbstractTripleStore; import com.bigdata.search.FullTextIndex; import com.bigdata.search.Hiterator; +import com.bigdata.search.IHit; /** * Abstraction for the text indexer for RDF {@link Value}s allowing either the @@ -46,14 +47,8 @@ * @version $Id$ * * @see AbstractTripleStore.Options#TEXT_INDEXER_CLASS - * - * @todo Provide a lucene integration point as an alternative to the - * {@link FullTextIndex}. Integrate for query and search of course. For - * extra credit, make the Lucene integration cluster aware. - * - * @todo mg4j support (see notes in my email) including clustered support. */ -public interface ITextIndexer { +public interface ITextIndexer<A extends IHit> { public void create(); @@ -83,17 +78,38 @@ * Return <code>true</code> iff datatype literals are being indexed. */ public boolean getIndexDatatypeLiterals(); - -// public Hiterator search(final String languageCode, final String text) -// throws InterruptedException; -// -// public Hiterator search(final String query, final String languageCode, -// final boolean prefixMatch); -// -// public Hiterator search(final String query, final String languageCode, -// final double minCosine, final int maxRank); - public Hiterator search(final String query, final String languageCode, + /** + * Do free text search + * + * @param query + * The query (it will be parsed into tokens). + * @param languageCode + * The language code that should be used when tokenizing the + * query -or- <code>null</code> to use the default {@link Locale} + * ). + * @param prefixMatch + * When <code>true</code>, the matches will be on tokens which + * include the query tokens as a prefix. This includes exact + * matches as a special case when the prefix is the entire token, + * but it also allows longer matches. For example, + * <code>free</code> will be an exact match on <code>free</code> + * but a partial match on <code>freedom</code>. When + * <code>false</code>, only exact matches will be made. + * @param minCosine + * The minimum cosine that will be returned. + * @param maxRank + * The upper bound on the #of hits in the result set. + * @param timeout + * The timeout -or- ZERO (0) for NO timeout (this is equivalent + * to using {@link Long#MAX_VALUE}). + * @param unit + * The unit in which the timeout is expressed. + * + * @return The result set. + */ + public Hiterator<A> search(final String query, final String languageCode, final boolean prefixMatch, final double minCosine, final int maxRank, long timeout, final TimeUnit unit); + } Modified: branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/LexiconRelation.java =================================================================== --- branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/LexiconRelation.java 2011-03-14 18:19:34 UTC (rev 4299) +++ branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/LexiconRelation.java 2011-03-15 13:49:04 UTC (rev 4300) @@ -813,7 +813,7 @@ * already imposes a canonicalizing mapping within for the index name * and timestamp inside of a JVM. */ - public ITextIndexer getSearchEngine() { + public ITextIndexer<?> getSearchEngine() { if (!textIndex) return null; @@ -829,13 +829,13 @@ if (viewRef.get() == null) { - final ITextIndexer tmp; + final ITextIndexer<?> tmp; try { final Class<?> vfc = determineTextIndexerClass(); final Method gi = vfc.getMethod("getInstance", IIndexManager.class, String.class, Long.class, Properties.class); - tmp = (ITextIndexer) gi.invoke(null/* object */, + tmp = (ITextIndexer<?>) gi.invoke(null/* object */, getIndexManager(), getNamespace(), getTimestamp(), getProperties()); if(tmp instanceof ILocatableResource<?>) { Modified: branches/QUADS_QUERY_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/BigdataEvaluationStrategyImpl3.java =================================================================== --- branches/QUADS_QUERY_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/BigdataEvaluationStrategyImpl3.java 2011-03-14 18:19:34 UTC (rev 4299) +++ branches/QUADS_QUERY_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/BigdataEvaluationStrategyImpl3.java 2011-03-15 13:49:04 UTC (rev 4300) @@ -1986,7 +1986,7 @@ log.debug("languageCode=" + languageCode + ", label=" + label); } - final Iterator<IHit> itr = database.getLexiconRelation() + final Iterator<IHit> itr = (Iterator)database.getLexiconRelation() .getSearchEngine().search(label, languageCode, false/* prefixMatch */, 0d/* minCosine */, 10000/* maxRank */, 1000L/* timeout */, Modified: branches/QUADS_QUERY_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/FreeTextSearchExpander.java =================================================================== --- branches/QUADS_QUERY_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/FreeTextSearchExpander.java 2011-03-14 18:19:34 UTC (rev 4299) +++ branches/QUADS_QUERY_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/FreeTextSearchExpander.java 2011-03-15 13:49:04 UTC (rev 4300) @@ -1,5 +1,6 @@ package com.bigdata.rdf.sail; +import java.io.Serializable; import java.util.Set; import java.util.concurrent.TimeUnit; @@ -46,23 +47,28 @@ */ public class FreeTextSearchExpander implements IAccessPathExpander<ISPO> { - protected static final Logger log = Logger.getLogger(FreeTextSearchExpander.class); + private static final transient Logger log = Logger + .getLogger(FreeTextSearchExpander.class); - protected static final boolean INFO = log.isInfoEnabled(); - - protected static final boolean DEBUG = log.isDebugEnabled(); - private static final long serialVersionUID = 1L; + private static final transient ISPO[] EMPTY = new ISPO[0]; + + /** + * FIXME This reference is NOT {@link Serializable}, but the expander is. + */ private final AbstractTripleStore database; private final Literal query, maxHits, minRelevance; - private Set<URI> graphs; + /** Note: volatile for visibility (or use AtomicReference). */ + private volatile Set<URI> graphs; public FreeTextSearchExpander(final AbstractTripleStore database, final Literal query) { - this(database, query, null, null); + + this(database, query, null, null); + } public FreeTextSearchExpander(final AbstractTripleStore database, @@ -113,7 +119,7 @@ * @param graphs * The set of named graphs to use in the filtering process. */ - public void addNamedGraphsFilter(Set<URI> graphs) { + public void addNamedGraphsFilter(final Set<URI> graphs) { this.graphs = graphs; @@ -125,7 +131,7 @@ private Hiterator<IHit> hiterator; - public FreeTextSearchAccessPath(IAccessPath<ISPO> accessPath) { + public FreeTextSearchAccessPath(final IAccessPath<ISPO> accessPath) { // final SPOPredicate pred = (SPOPredicate) accessPath.getPredicate(); // IVariableOrConstant<IV> p = pred.p(); // IVariableOrConstant<IV> o = pred.o(); @@ -136,19 +142,17 @@ } private Hiterator<IHit> getHiterator() { + if (hiterator == null) { - assert database!=null; - assert query != null; - final ITextIndexer textNdx = + @SuppressWarnings("unchecked") + final ITextIndexer<IHit> textNdx = (ITextIndexer) database.getLexiconRelation().getSearchEngine(); if (textNdx == null) throw new UnsupportedOperationException( "No free text index?"); -// final long begin = System.nanoTime(); - String s = query.getLabel(); final boolean prefixMatch; if (s.indexOf('*') >= 0) { @@ -157,27 +161,26 @@ } else { prefixMatch = false; } - + + /* + * FIXME This is using a constant (1000ms) for the timeout on + * the free text search. That needs to be passed down from the + * SAIL. + * + * @todo Rather than explicitly passing in all of these as + * parameters to the constructor, why not pass them through as + * annotations on the magic predicate? + */ hiterator = textNdx.search(s, query.getLanguage(), prefixMatch, minRelevance == null ? 0d : minRelevance.doubleValue()/* minCosine */, maxHits == null ? 10000 : maxHits.intValue()+1/* maxRank */, 1000L/* timeout */, TimeUnit.MILLISECONDS); -// hiterator = database.getSearchEngine().search -// ( query.getLabel(), -// query.getLanguage(), -// 0d, // @todo param for minCosine, -// 10000 // @todo param for maxRank, -//// timeout, -//// unit -// ); -// final long elapsed = System.nanoTime() - begin; -// log.warn("search time=" -// + TimeUnit.MILLISECONDS.convert(elapsed, -// TimeUnit.NANOSECONDS)+", query="+query+", nhits="+hiterator.size()); } + return hiterator; + } public IIndex getIndex() { @@ -189,7 +192,7 @@ /** * The results are in decreasing cosine (aka relevance) order. * - * @return <code>null</code> since the results are not in any + * @return <code>null</code> since the results are not in any defined * {@link SPOKeyOrder}. */ public IKeyOrder<ISPO> getKeyOrder() { @@ -213,28 +216,6 @@ } public IChunkedOrderedIterator<ISPO> iterator() { - -// /* -// * FIXME remove. times the search hit converter but has side effect. -// */ -// { -// final IChunkedOrderedIterator<IHit> itr2 = new ChunkedWrappedIterator<IHit>( -// getHiterator()); -// -// final IChunkedOrderedIterator<ISPO> itr3 = new ChunkedConvertingIterator<IHit, ISPO>( -// itr2, new HitConverter(accessPath)); -// -// final long begin = System.nanoTime(); -// while (itr3.hasNext()) { -// itr3.next(); -// } -// final long elapsed = System.nanoTime() - begin; -// log.error("search converting iterator time=" -// + TimeUnit.MILLISECONDS.convert(elapsed, -// TimeUnit.NANOSECONDS) + ", query=" + query -// + ", nhits=" + hiterator.size()); -// hiterator = null; // clear reference since we will need to reobtain the hiterator. -// } final IChunkedOrderedIterator<IHit> itr2 = new ChunkedWrappedIterator<IHit>(getHiterator()); @@ -256,7 +237,7 @@ new ChunkedOrderedStriterator<IChunkedOrderedIterator<ISPO>, ISPO>(itr3). addFilter(new Filter<IChunkedOrderedIterator<ISPO>, ISPO>() { protected boolean isValid(ISPO e) { - BigdataValue val = database.getTerm(e.s()); + final BigdataValue val = database.getTerm(e.s()); for (URI c : graphs) { if (database.getAccessPath(null, null, val, c).rangeCount(true) > 0) { return true; @@ -284,11 +265,11 @@ } - public long rangeCount(boolean exact) { + public long rangeCount(final boolean exactIsIgnored) { final long rangeCount = getHiterator().size(); - if (INFO) + if (log.isInfoEnabled()) log.info("range count: " + rangeCount); return rangeCount; @@ -309,42 +290,42 @@ } - private class HitConverter implements IChunkConverter<IHit,ISPO> { + static private class HitConverter implements IChunkConverter<IHit,ISPO> { private final boolean isBound; private final IV boundVal; - public HitConverter(IAccessPath<ISPO> accessPath) { - SPOPredicate pred = (SPOPredicate) accessPath.getPredicate(); - IVariableOrConstant<IV> s = pred.s(); + public HitConverter(final IAccessPath<ISPO> accessPath) { + final SPOPredicate pred = (SPOPredicate) accessPath.getPredicate(); + final IVariableOrConstant<IV> s = pred.s(); this.isBound = s.isConstant(); - if (INFO) log.info("isBound: " + isBound); this.boundVal = isBound ? s.get() : null; - if (INFO) log.info("boundVal: " + boundVal); + if (log.isInfoEnabled()) + log.info("isBound=" + isBound + ", boundVal: " + boundVal); } - public ISPO[] convert(IChunkedOrderedIterator<IHit> src) { - if (DEBUG) log.debug("converting chunk"); - IHit[] hits = src.nextChunk(); + public ISPO[] convert(final IChunkedOrderedIterator<IHit> src) { + final IHit[] hits = src.nextChunk(); if (isBound) { return convertWhenBound(hits); } - ISPO[] spos = new ISPO[hits.length]; + final ISPO[] spos = new ISPO[hits.length]; for (int i = 0; i < hits.length; i++) { final IV s = new TermId(VTE.LITERAL, hits[i].getDocId()); final IV p = new XSDDoubleIV(hits[i].getCosine()); final IV o = null; // reserved final IV c = null; // reserved spos[i] = new SPO(s, p, o, c); - if (INFO) log.info("hit: " + spos[i]); + if (log.isInfoEnabled()) + log.info("hit: " + spos[i]); } // Arrays.sort(spos, SPOKeyOrder.SPO.getComparator()); return spos; } - private ISPO[] convertWhenBound(IHit[] hits) { - ISPO[] result = new ISPO[0]; + private ISPO[] convertWhenBound(final IHit[] hits) { + ISPO[] result = EMPTY; for (IHit hit : hits) { final IV s = new TermId(VTE.LITERAL, hit.getDocId()); if (s == boundVal) { @@ -355,7 +336,8 @@ break; } } - if (INFO) log.info("# of results: " + result.length); + if (log.isInfoEnabled()) + log.info("# of results: " + result.length); return result; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |