[Bigdata-commit] SF.net SVN: bigdata:[4300] branches/QUADS_QUERY_BRANCH/bigdata

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 4300
          http://bigdata.svn.sourceforge.net/bigdata/?rev=4300&view=rev
Author:   thompsonbry
Date:     2011-03-15 13:49:04 +0000 (Tue, 15 Mar 2011)

Log Message:
-----------
A little bit of cleanup for the free text index search stuff.

Modified Paths:
--------------
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/search/FullTextIndex.java
    branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/BigdataRDFFullTextIndex.java
    branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/ITextIndexer.java
    branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/LexiconRelation.java
    branches/QUADS_QUERY_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/BigdataEvaluationStrategyImpl3.java
    branches/QUADS_QUERY_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/FreeTextSearchExpander.java

Modified: branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/search/FullTextIndex.java
===================================================================

--- branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/search/FullTextIndex.java	2011-03-14 18:19:34 UTC (rev 4299)
+++ branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/search/FullTextIndex.java	2011-03-15 13:49:04 UTC (rev 4300)
@@ -28,8 +28,6 @@
 
 package com.bigdata.search;
 
-import info.aduna.i18n.languagetag.IanaLanguageTag;
-
 import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
@@ -51,10 +49,7 @@
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.standard.StandardFilter;
-import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
-import org.apache.lucene.util.Version;
 
 import com.bigdata.bop.IBindingSet;
 import com.bigdata.bop.IPredicate;
@@ -1017,7 +1012,7 @@
      *       iterator that is sent to the data service such that the search
      *       terms are visited only when they occur in the matching field(s).
      */
-    public Hiterator search(final String query, final String languageCode,
+    public Hiterator<Hit> search(final String query, final String languageCode,
             final boolean prefixMatch, final double minCosine,
             final int maxRank, long timeout, final TimeUnit unit) {
 
@@ -1048,7 +1043,7 @@
 
         if (timeout == 0L) {
 
-            // treat ZERO as eqivalent to MAX_LONG.
+            // treat ZERO as equivalent to MAX_LONG.
             timeout = Long.MAX_VALUE;
             
         }
@@ -1188,11 +1183,6 @@
         throw new UnsupportedOperationException();
     }
 
-//    @SuppressWarnings("unchecked")
-//    public IAccessPath getAccessPath(IPredicate predicate) {
-//        throw new UnsupportedOperationException();
-//    }
-
     public Set<String> getIndexNames() {
         throw new UnsupportedOperationException();
     }

Modified: branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/BigdataRDFFullTextIndex.java
===================================================================
--- branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/BigdataRDFFullTextIndex.java	2011-03-14 18:19:34 UTC (rev 4299)
+++ branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/BigdataRDFFullTextIndex.java	2011-03-15 13:49:04 UTC (rev 4300)
@@ -37,8 +37,8 @@
 import com.bigdata.rdf.internal.IV;
 import com.bigdata.rdf.model.BigdataValue;
 import com.bigdata.rdf.store.AbstractTripleStore;
-import com.bigdata.rdf.store.IRawTripleStore;
 import com.bigdata.search.FullTextIndex;
+import com.bigdata.search.Hit;
 import com.bigdata.search.TokenBuffer;
 
 /**
@@ -48,18 +48,18 @@
  * @version $Id$
  */
 public class BigdataRDFFullTextIndex extends FullTextIndex implements
-        ITextIndexer {
+        ITextIndexer<Hit> {
 
-    static public ITextIndexer getInstance(final IIndexManager indexManager,
-            final String namespace, final Long timestamp,
-            final Properties properties) {
+    static public BigdataRDFFullTextIndex getInstance(
+            final IIndexManager indexManager, final String namespace,
+            final Long timestamp, final Properties properties) {
 
         if (namespace == null)
             throw new IllegalArgumentException();
-        
+
         return new BigdataRDFFullTextIndex(indexManager, namespace, timestamp,
                 properties);
-        
+
     }
 
     /**
@@ -79,8 +79,9 @@
      * @param timestamp
      * @param properties
      */
-    public BigdataRDFFullTextIndex(IIndexManager indexManager,
-            String namespace, Long timestamp, Properties properties) {
+    public BigdataRDFFullTextIndex(final IIndexManager indexManager,
+            final String namespace, final Long timestamp,
+            final Properties properties) {
 
         super(indexManager, namespace, timestamp, properties);
 
@@ -163,10 +164,9 @@
              * cost of re-indexing each time we see a term.
              */
 
-            final IV termId = val.getIV();
+            final IV<?,?> termId = val.getIV();
 
-            assert termId != null; // the termId must have been
-                                                    // assigned.
+            assert termId != null; // the termId must have been assigned.
 
             // don't bother text indexing inline values for now
             if (termId.isInline()) {

Modified: branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/ITextIndexer.java
===================================================================
--- branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/ITextIndexer.java	2011-03-14 18:19:34 UTC (rev 4299)
+++ branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/ITextIndexer.java	2011-03-15 13:49:04 UTC (rev 4300)
@@ -37,6 +37,7 @@
 import com.bigdata.rdf.store.AbstractTripleStore;
 import com.bigdata.search.FullTextIndex;
 import com.bigdata.search.Hiterator;
+import com.bigdata.search.IHit;
 
 /**
  * Abstraction for the text indexer for RDF {@link Value}s allowing either the
@@ -46,14 +47,8 @@
  * @version $Id$
  * 
  * @see AbstractTripleStore.Options#TEXT_INDEXER_CLASS
- * 
- * @todo Provide a lucene integration point as an alternative to the
- *       {@link FullTextIndex}. Integrate for query and search of course. For
- *       extra credit, make the Lucene integration cluster aware.
- * 
- * @todo mg4j support (see notes in my email) including clustered support.
  */
-public interface ITextIndexer {
+public interface ITextIndexer<A extends IHit> {
    
     public void create();
 
@@ -83,17 +78,38 @@
      * Return <code>true</code> iff datatype literals are being indexed.
      */
     public boolean getIndexDatatypeLiterals();
-    
-//    public Hiterator search(final String languageCode, final String text)
-//            throws InterruptedException;
-//
-//    public Hiterator search(final String query, final String languageCode,
-//            final boolean prefixMatch);
-//
-//    public Hiterator search(final String query, final String languageCode,
-//            final double minCosine, final int maxRank);
 
-    public Hiterator search(final String query, final String languageCode,
+    /**
+     * Performs a free text search.
+     * 
+     * @param query
+     *            The query (it will be parsed into tokens).
+     * @param languageCode
+     *            The language code that should be used when tokenizing the
+     *            query -or- <code>null</code> to use the default
+     *            {@link Locale}.
+     * @param prefixMatch
+     *            When <code>true</code>, the matches will be on tokens which
+     *            include the query tokens as a prefix. This includes exact
+     *            matches as a special case when the prefix is the entire token,
+     *            but it also allows longer matches. For example,
+     *            <code>free</code> will be an exact match on <code>free</code>
+     *            but a partial match on <code>freedom</code>. When
+     *            <code>false</code>, only exact matches will be made.
+     * @param minCosine
+     *            The minimum cosine that will be returned.
+     * @param maxRank
+     *            The upper bound on the #of hits in the result set.
+     * @param timeout
+     *            The timeout -or- ZERO (0) for NO timeout (this is equivalent
+     *            to using {@link Long#MAX_VALUE}).
+     * @param unit
+     *            The unit in which the timeout is expressed.
+     * 
+     * @return The result set.
+     */
+    public Hiterator<A> search(final String query, final String languageCode,
             final boolean prefixMatch, final double minCosine,
             final int maxRank, long timeout, final TimeUnit unit);
+
 }

Modified: branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/LexiconRelation.java
===================================================================
--- branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/LexiconRelation.java	2011-03-14 18:19:34 UTC (rev 4299)
+++ branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/LexiconRelation.java	2011-03-15 13:49:04 UTC (rev 4300)
@@ -813,7 +813,7 @@
      *       already imposes a canonicalizing mapping within for the index name
      *       and timestamp inside of a JVM.
      */
-    public ITextIndexer getSearchEngine() {
+    public ITextIndexer<?> getSearchEngine() {
 
         if (!textIndex)
             return null;
@@ -829,13 +829,13 @@
 
                 if (viewRef.get() == null) {
 
-                    final ITextIndexer tmp;
+                    final ITextIndexer<?> tmp;
                     try {
                         final Class<?> vfc = determineTextIndexerClass();
                         final Method gi = vfc.getMethod("getInstance",
                                 IIndexManager.class, String.class, Long.class,
                                 Properties.class);
-                        tmp = (ITextIndexer) gi.invoke(null/* object */,
+                        tmp = (ITextIndexer<?>) gi.invoke(null/* object */,
                                 getIndexManager(), getNamespace(),
                                 getTimestamp(), getProperties());
                         if(tmp instanceof ILocatableResource<?>) {

Modified: branches/QUADS_QUERY_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/BigdataEvaluationStrategyImpl3.java
===================================================================
--- branches/QUADS_QUERY_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/BigdataEvaluationStrategyImpl3.java	2011-03-14 18:19:34 UTC (rev 4299)
+++ branches/QUADS_QUERY_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/BigdataEvaluationStrategyImpl3.java	2011-03-15 13:49:04 UTC (rev 4300)
@@ -1986,7 +1986,7 @@
             log.debug("languageCode=" + languageCode + ", label=" + label);
         }
         
-        final Iterator<IHit> itr = database.getLexiconRelation()
+        final Iterator<IHit> itr = (Iterator)database.getLexiconRelation()
                 .getSearchEngine().search(label, languageCode,
                         false/* prefixMatch */, 0d/* minCosine */,
                         10000/* maxRank */, 1000L/* timeout */,

Modified: branches/QUADS_QUERY_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/FreeTextSearchExpander.java
===================================================================
--- branches/QUADS_QUERY_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/FreeTextSearchExpander.java	2011-03-14 18:19:34 UTC (rev 4299)
+++ branches/QUADS_QUERY_BRANCH/bigdata-sails/src/java/com/bigdata/rdf/sail/FreeTextSearchExpander.java	2011-03-15 13:49:04 UTC (rev 4300)
@@ -1,5 +1,6 @@
 package com.bigdata.rdf.sail;
 
+import java.io.Serializable;
 import java.util.Set;
 import java.util.concurrent.TimeUnit;
 
@@ -46,23 +47,28 @@
  */
 public class FreeTextSearchExpander implements IAccessPathExpander<ISPO> {
     
-    protected static final Logger log = Logger.getLogger(FreeTextSearchExpander.class);
+    private static final transient Logger log = Logger
+            .getLogger(FreeTextSearchExpander.class);
     
-    protected static final boolean INFO = log.isInfoEnabled();
-
-    protected static final boolean DEBUG = log.isDebugEnabled();
-    
     private static final long serialVersionUID = 1L;
     
+    private static final transient ISPO[] EMPTY = new ISPO[0];
+
+    /**
+     * FIXME This reference is NOT {@link Serializable}, but the expander is.
+     */
     private final AbstractTripleStore database;
     
     private final Literal query, maxHits, minRelevance;
     
-    private Set<URI> graphs;
+    /** Note: volatile for visibility (or use AtomicReference). */
+    private volatile Set<URI> graphs;
     
     public FreeTextSearchExpander(final AbstractTripleStore database,
             final Literal query) {
-    	this(database, query, null, null);
+
+        this(database, query, null, null);
+        
     }
     
     public FreeTextSearchExpander(final AbstractTripleStore database,
@@ -113,7 +119,7 @@
      * @param graphs
      *          The set of named graphs to use in the filtering process.
      */
-    public void addNamedGraphsFilter(Set<URI> graphs) {
+    public void addNamedGraphsFilter(final Set<URI> graphs) {
         
         this.graphs = graphs;
         
@@ -125,7 +131,7 @@
         
         private Hiterator<IHit> hiterator;
         
-        public FreeTextSearchAccessPath(IAccessPath<ISPO> accessPath) {
+        public FreeTextSearchAccessPath(final IAccessPath<ISPO> accessPath) {
 //            final SPOPredicate pred = (SPOPredicate) accessPath.getPredicate();
 //            IVariableOrConstant<IV> p = pred.p();
 //            IVariableOrConstant<IV> o = pred.o();
@@ -136,19 +142,17 @@
         }
         
         private Hiterator<IHit> getHiterator() {
+
             if (hiterator == null) {
-                assert database!=null;
-                assert query != null;
                 
-                final ITextIndexer textNdx = 
+                @SuppressWarnings("unchecked")
+                final ITextIndexer<IHit> textNdx = (ITextIndexer) 
                 	database.getLexiconRelation().getSearchEngine();
                 
                 if (textNdx == null)
                     throw new UnsupportedOperationException(
                             "No free text index?");
                 
-//                final long begin = System.nanoTime();
-                
                 String s = query.getLabel();
                 final boolean prefixMatch;
                 if (s.indexOf('*') >= 0) {
@@ -157,27 +161,26 @@
                 } else {
                 	prefixMatch = false;
                 }
-                
+
+                /*
+                 * FIXME This is using a constant (1000ms) for the timeout on
+                 * the free text search. That needs to be passed down from the
+                 * SAIL.
+                 * 
+                 * @todo Rather than explicitly passing in all of these as
+                 * parameters to the constructor, why not pass them through as
+                 * annotations on the magic predicate?
+                 */
                 hiterator = textNdx.search(s,
                                 query.getLanguage(), 
                                 prefixMatch,
                                 minRelevance == null ? 0d : minRelevance.doubleValue()/* minCosine */, 
                                 maxHits == null ? 10000 : maxHits.intValue()+1/* maxRank */,
                                 1000L/* timeout */, TimeUnit.MILLISECONDS);
-//                hiterator = database.getSearchEngine().search
-//                    ( query.getLabel(),
-//                      query.getLanguage(), 
-//                      0d, // @todo param for minCosine,
-//                      10000 // @todo param for maxRank,
-////                      timeout,
-////                      unit
-//                      );
-//                final long elapsed = System.nanoTime() - begin;
-//                log.warn("search time="
-//                        + TimeUnit.MILLISECONDS.convert(elapsed,
-//                                TimeUnit.NANOSECONDS)+", query="+query+", nhits="+hiterator.size());
             }
+
             return hiterator;
+            
         }
         
         public IIndex getIndex() {
@@ -189,7 +192,7 @@
         /**
          * The results are in decreasing cosine (aka relevance) order.
          * 
-         * @return <code>null</code> since the results are not in any
+         * @return <code>null</code> since the results are not in any defined
          *         {@link SPOKeyOrder}.
          */
         public IKeyOrder<ISPO> getKeyOrder() {
@@ -213,28 +216,6 @@
         }
 
         public IChunkedOrderedIterator<ISPO> iterator() {
-
-//            /*
-//             * FIXME remove. times the search hit converter but has side effect.
-//             */
-//            {
-//                final IChunkedOrderedIterator<IHit> itr2 = new ChunkedWrappedIterator<IHit>(
-//                        getHiterator());
-//
-//                final IChunkedOrderedIterator<ISPO> itr3 = new ChunkedConvertingIterator<IHit, ISPO>(
-//                        itr2, new HitConverter(accessPath));
-//
-//                final long begin = System.nanoTime();
-//                while (itr3.hasNext()) {
-//                    itr3.next();
-//                }
-//                final long elapsed = System.nanoTime() - begin;
-//                log.error("search converting iterator time="
-//                        + TimeUnit.MILLISECONDS.convert(elapsed,
-//                                TimeUnit.NANOSECONDS) + ", query=" + query
-//                        + ", nhits=" + hiterator.size());
-//                hiterator = null; // clear reference since we will need to reobtain the hiterator.
-//            }
             
             final IChunkedOrderedIterator<IHit> itr2 = 
                 new ChunkedWrappedIterator<IHit>(getHiterator());
@@ -256,7 +237,7 @@
                 new ChunkedOrderedStriterator<IChunkedOrderedIterator<ISPO>, ISPO>(itr3).
                 addFilter(new Filter<IChunkedOrderedIterator<ISPO>, ISPO>() {
                     protected boolean isValid(ISPO e) {
-                        BigdataValue val = database.getTerm(e.s());
+                        final BigdataValue val = database.getTerm(e.s());
                         for (URI c : graphs) {
                             if (database.getAccessPath(null, null, val, c).rangeCount(true) > 0) {
                                 return true;
@@ -284,11 +265,11 @@
             
         }
 
-        public long rangeCount(boolean exact) {
+        public long rangeCount(final boolean exactIsIgnored) {
 
             final long rangeCount = getHiterator().size();
 
-            if (INFO)
+            if (log.isInfoEnabled())
                 log.info("range count: " + rangeCount);
 
             return rangeCount;
@@ -309,42 +290,42 @@
         
     }
     
-    private class HitConverter implements IChunkConverter<IHit,ISPO> {
+    static private class HitConverter implements IChunkConverter<IHit,ISPO> {
         
         private final boolean isBound;
         
         private final IV boundVal;
         
-        public HitConverter(IAccessPath<ISPO> accessPath) {
-            SPOPredicate pred = (SPOPredicate) accessPath.getPredicate();
-            IVariableOrConstant<IV> s = pred.s();
+        public HitConverter(final IAccessPath<ISPO> accessPath) {
+            final SPOPredicate pred = (SPOPredicate) accessPath.getPredicate();
+            final IVariableOrConstant<IV> s = pred.s();
             this.isBound = s.isConstant();
-            if (INFO) log.info("isBound: " + isBound);
             this.boundVal = isBound ? s.get() : null;
-            if (INFO) log.info("boundVal: " + boundVal);
+            if (log.isInfoEnabled())
+                log.info("isBound=" + isBound + ", boundVal: " + boundVal);
         }
 
-        public ISPO[] convert(IChunkedOrderedIterator<IHit> src) {
-            if (DEBUG) log.debug("converting chunk");
-            IHit[] hits = src.nextChunk();
+        public ISPO[] convert(final IChunkedOrderedIterator<IHit> src) {
+            final IHit[] hits = src.nextChunk();
             if (isBound) {
                 return convertWhenBound(hits);
             }
-            ISPO[] spos = new ISPO[hits.length];
+            final ISPO[] spos = new ISPO[hits.length];
             for (int i = 0; i < hits.length; i++) {
                 final IV s = new TermId(VTE.LITERAL, hits[i].getDocId());
                 final IV p = new XSDDoubleIV(hits[i].getCosine());
                 final IV o = null; // reserved
                 final IV c = null; // reserved
                 spos[i] = new SPO(s, p, o, c);
-                if (INFO) log.info("hit: " + spos[i]);
+                if (log.isInfoEnabled())
+                    log.info("hit: " + spos[i]);
             }
 //            Arrays.sort(spos, SPOKeyOrder.SPO.getComparator());
             return spos;
         }
         
-        private ISPO[] convertWhenBound(IHit[] hits) {
-            ISPO[] result = new ISPO[0];
+        private ISPO[] convertWhenBound(final IHit[] hits) {
+            ISPO[] result = EMPTY;
             for (IHit hit : hits) {
                 final IV s = new TermId(VTE.LITERAL, hit.getDocId());
                 if (s == boundVal) {
@@ -355,7 +336,8 @@
                     break;
                 }
             }
-            if (INFO) log.info("# of results: " + result.length);
+            if (log.isInfoEnabled())
+                log.info("# of results: " + result.length);
             return result;
         }
 


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.




[Bigdata-commit] SF.net SVN: bigdata:[4300] branches/QUADS_QUERY_BRANCH/bigdata

Fast, scalable, robust graph database platform

[Bigdata-commit] SF.net SVN: bigdata:[4300] branches/QUADS_QUERY_BRANCH/bigdata