[Bigdata-commit] SF.net SVN: bigdata:[4266] branches/QUADS_QUERY_BRANCH/bigdata

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 4266
          http://bigdata.svn.sourceforge.net/bigdata/?rev=4266&view=rev
Author:   thompsonbry
Date:     2011-03-03 18:38:51 +0000 (Thu, 03 Mar 2011)

Log Message:
-----------
https://sourceforge.net/apps/trac/bigdata/ticket/265

- Added a method to re-build the full text index and a unit test for the same (see TestFullTextIndex).

- Javadoc edits to the ITextIndexer interface.

https://sourceforge.net/apps/trac/bigdata/ticket/221

- Modified AbstractJournal#dropIndex(name) to conditionally invoke removeAll() on the index in order to reclaim its storage when the index is backed by the RWStore.

Modified Paths:
--------------
    branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/journal/AbstractJournal.java
    branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/BigdataRDFFullTextIndex.java
    branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/ITextIndexer.java
    branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/LexiconRelation.java
    branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/rdf/lexicon/TestFullTextIndex.java

Modified: branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/journal/AbstractJournal.java
===================================================================

--- branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/journal/AbstractJournal.java	2011-03-03 17:12:31 UTC (rev 4265)
+++ branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/journal/AbstractJournal.java	2011-03-03 18:38:51 UTC (rev 4266)
@@ -3448,14 +3448,25 @@
 
 	}
 
-	/**
-	 * Drops the named index. The index will no longer participate in atomic
-	 * commits and will not be visible to new transactions. Resources are NOT
-	 * reclaimed on the {@link AbstractJournal} (it is an immortal store) and
-	 * historical states of the index will continue to be accessible.
-	 */
+    /**
+     * Drops the named index. The index will no longer participate in atomic
+     * commits and will not be visible to new transactions.  Storage will be
+     * reclaimed IFF the backing store supports that functionality.
+     */
 	public void dropIndex(final String name) {
 
+	    final BTree ndx = getIndex(name);
+	    
+	    if(ndx == null)
+	        throw new NoSuchIndexException(name);
+	    
+	    if(getBufferStrategy() instanceof RWStrategy) {
+            /*
+             * Reclaim storage associated with the index.
+             */
+	        ndx.removeAll();
+	    }
+	    
 		final ReadLock lock = _fieldReadWriteLock.readLock();
 
 		lock.lock();

Modified: branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/BigdataRDFFullTextIndex.java
===================================================================
--- branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/BigdataRDFFullTextIndex.java	2011-03-03 17:12:31 UTC (rev 4265)
+++ branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/BigdataRDFFullTextIndex.java	2011-03-03 18:38:51 UTC (rev 4266)
@@ -110,11 +110,15 @@
     
         assertWritable();
         
-        getIndexManager().dropIndex(getNamespace() + "." + NAME_SEARCH);
+        final String name = getNamespace() + "." + NAME_SEARCH;
+        
+        getIndexManager().dropIndex(name);
 
     }
 
-    public void index(int capacity, Iterator<BigdataValue> valuesIterator) {
+    public void index(final int capacity,
+            final Iterator<BigdataValue> valuesIterator) {
+        
         final TokenBuffer buffer = new TokenBuffer(capacity, this);
 
         int n = 0;

Modified: branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/ITextIndexer.java
===================================================================
--- branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/ITextIndexer.java	2011-03-03 17:12:31 UTC (rev 4265)
+++ branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/ITextIndexer.java	2011-03-03 18:38:51 UTC (rev 4266)
@@ -68,6 +68,9 @@
      * are tokenized using the default {@link Locale}.
      * </p>
      * 
+     * @param capacity
+     *            A hint to the underlying layer about the buffer size before an
+     *            incremental flush of the index.
      * @param itr
      *            Iterator visiting the terms to be indexed.
      * 

Modified: branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/LexiconRelation.java
===================================================================
--- branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/LexiconRelation.java	2011-03-03 17:12:31 UTC (rev 4265)
+++ branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/LexiconRelation.java	2011-03-03 18:38:51 UTC (rev 4266)
@@ -63,21 +63,24 @@
 import com.bigdata.btree.IRangeQuery;
 import com.bigdata.btree.ITuple;
 import com.bigdata.btree.ITupleIterator;
+import com.bigdata.btree.ITupleSerializer;
 import com.bigdata.btree.IndexMetadata;
 import com.bigdata.btree.filter.PrefixFilter;
+import com.bigdata.btree.filter.TupleFilter;
 import com.bigdata.btree.keys.DefaultKeyBuilderFactory;
 import com.bigdata.btree.keys.IKeyBuilder;
 import com.bigdata.btree.keys.KVO;
 import com.bigdata.btree.keys.KeyBuilder;
 import com.bigdata.btree.keys.StrengthEnum;
+import com.bigdata.btree.proc.IResultHandler;
 import com.bigdata.btree.proc.AbstractKeyArrayIndexProcedure.ResultBuffer;
 import com.bigdata.btree.proc.AbstractKeyArrayIndexProcedure.ResultBufferHandler;
 import com.bigdata.btree.proc.BatchLookup.BatchLookupConstructor;
-import com.bigdata.btree.proc.IResultHandler;
 import com.bigdata.btree.raba.IRaba;
 import com.bigdata.cache.ConcurrentWeakValueCacheWithBatchedUpdates;
 import com.bigdata.journal.IIndexManager;
 import com.bigdata.journal.IResourceLock;
+import com.bigdata.journal.ITx;
 import com.bigdata.rawstore.Bytes;
 import com.bigdata.rdf.internal.IDatatypeURIResolver;
 import com.bigdata.rdf.internal.IExtensionFactory;
@@ -88,6 +91,7 @@
 import com.bigdata.rdf.internal.TermId;
 import com.bigdata.rdf.lexicon.Term2IdWriteProc.Term2IdWriteProcConstructor;
 import com.bigdata.rdf.model.BigdataBNode;
+import com.bigdata.rdf.model.BigdataLiteral;
 import com.bigdata.rdf.model.BigdataURI;
 import com.bigdata.rdf.model.BigdataValue;
 import com.bigdata.rdf.model.BigdataValueFactory;
@@ -1619,6 +1623,86 @@
     }
 
     /**
+     * Utility method to (re-)build the full text index. This is a high latency
+     * operation for a database of any significant size. You must be using the
+     * unisolated view of the {@link AbstractTripleStore} for this operation.
+     * {@link AbstractTripleStore.Options#TEXT_INDEX} must be enabled. This
+     * operation is only supported when the {@link ITextIndexer} uses the
+     * {@link FullTextIndex} class.
+     * 
+     * TODO This will have to be redone once we finish
+     * http://sourceforge.net/apps/trac/bigdata/ticket/109 (store large literals
+     * as blobs) since the ID2TERM index will disappear.
+     */
+    @SuppressWarnings("unchecked")
+    public void rebuildTextIndex() {
+
+        if (getTimestamp() != ITx.UNISOLATED)
+            throw new UnsupportedOperationException();
+        
+        if(!textIndex)
+            throw new UnsupportedOperationException();
+        
+        final ITextIndexer textIndexer = getSearchEngine();
+        
+        if (textIndexer == null) {
+            throw new UnsupportedOperationException();
+        }
+
+        // destroy the existing text index.
+        textIndexer.destroy();
+        
+        // create a new index.
+        textIndexer.create();
+
+        // the index to scan for the RDF Literals.
+        final IIndex id2term = getId2TermIndex();
+
+        // used to decode the keys and values of the id2term index tuples.
+        final ITupleSerializer tupSer = id2term.getIndexMetadata()
+                .getTupleSerializer();
+
+        /*
+         * Visit all plain, language code, and datatype literals in the lexicon.
+         * 
+         * Note: This uses a filter on the ITupleIterator in order to filter out
+         * non-literal terms before they are shipped from a remote index shard.
+         */
+        final Iterator<BigdataValue> itr = new Striterator(id2term
+                .rangeIterator(null/* fromKey */, null/* toKey */,
+                        0/* capacity */, IRangeQuery.DEFAULT,
+                        new TupleFilter<BigdataValue>() {
+                            private static final long serialVersionUID = 1L;
+                            protected boolean isValid(
+                                    final ITuple<BigdataValue> obj) {
+                                final IV iv = (IV) tupSer.deserializeKey(obj);
+                                if (!iv.isInline() && iv.isLiteral()) {
+                                    return true;
+                                }
+                                return false;
+                            }
+                        })).addFilter(new Resolver() {
+            private static final long serialVersionUID = 1L;
+
+            protected Object resolve(final Object obj) {
+                final BigdataLiteral lit = (BigdataLiteral) tupSer
+                        .deserialize((ITuple) obj);
+//                System.err.println("lit: "+lit);
+                return lit;
+            }
+        });
+
+        final int capacity = 10000;
+
+        while (itr.hasNext()) {
+
+            indexTermText(capacity, itr);
+
+        }
+
+    }
+    
+    /**
      * Batch resolution of internal values to {@link BigdataValue}s.
      * 
      * @param ivs

Modified: branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/rdf/lexicon/TestFullTextIndex.java
===================================================================
--- branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/rdf/lexicon/TestFullTextIndex.java	2011-03-03 17:12:31 UTC (rev 4265)
+++ branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/rdf/lexicon/TestFullTextIndex.java	2011-03-03 18:38:51 UTC (rev 4266)
@@ -362,5 +362,161 @@
         }
         
     }
+
+    /**
+     * Unit test for {@link LexiconRelation#rebuildTextIndex()}.
+     */
+    public void test_rebuildIndex() {
+        
+        AbstractTripleStore store = getStore();
+
+        try {
+
+            assertNotNull(store.getLexiconRelation().getSearchEngine());
+
+            final BigdataValueFactory f = store.getValueFactory();
+            
+            final BigdataValue[] terms = new BigdataValue[] {//
+                    f.createLiteral("abc"),//
+                    f.createLiteral("abc", "en"),//
+                    f.createLiteral("good day", "en"),//
+                    f.createLiteral("gutten tag", "de"),//
+                    f.createLiteral("tag team", "en"),//
+                    f.createLiteral("the first day", "en"),// // 'the' is a stopword.
+
+                    f.createURI("http://www.bigdata.com"),//
+                    f.asValue(RDF.TYPE),//
+                    f.asValue(RDFS.SUBCLASSOF),//
+                    f.asValue(XMLSchema.DECIMAL),//
+
+                    f.createBNode(UUID.randomUUID().toString()),//
+                    f.createBNode("a12"),//
+            };
+
+            store.addTerms(terms);
+
+            if(log.isInfoEnabled()) {
+                log.info(store.getLexiconRelation().dumpTerms());
+            }
+
+            /*
+             * Note: the language code is only used when tokenizing literals. It
+             * IS NOT applied as a filter to the recovered literals.
+             */
+            
+            assertExpectedHits(store, "abc", null/* languageCode */,
+                    new BigdataValue[] { //
+                    f.createLiteral("abc"),//
+                            f.createLiteral("abc", "en") //
+                    });
+
+            assertExpectedHits(store, "tag", "en", new BigdataValue[] {//
+                    f.createLiteral("gutten tag", "de"), //
+                    f.createLiteral("tag team", "en") //
+                    });
+
+            assertExpectedHits(store, "tag", "de", new BigdataValue[] {//
+                    f.createLiteral("gutten tag", "de"), //
+                    f.createLiteral("tag team", "en") //
+                    });
+
+            assertExpectedHits(store, "GOOD DAY", "en", //
+                    .0f, // minCosine
+                    new BigdataValue[] {//
+                    f.createLiteral("good day", "en"), //
+                    f.createLiteral("the first day", "en") //
+                    });
+
+            assertExpectedHits(store, "GOOD DAY", "en", //
+                    .4f, // minCosine
+                    new BigdataValue[] {//
+                    f.createLiteral("good day", "en"), //
+                    });
+
+            assertExpectedHits(store, "day", "en", //
+                    .0f, // minCosine
+                    new BigdataValue[] {
+                    f.createLiteral("good day", "en"),
+                    f.createLiteral("the first day", "en") });
+
+            // 'the' is a stopword, so there are no hits.
+            assertExpectedHits(store, "the", "en", new BigdataValue[] {});
+
+            /*
+             * re-open the store before search to verify that the data were made
+             * restart safe.
+             */
+            if (store.isStable()) {
+
+                store.commit();
+
+                store = reopenStore(store);
+
+            }
+
+            // rebuild the full text index.
+            store.getLexiconRelation().rebuildTextIndex();
+            
+            /*
+             * re-open the store before search to verify that the data were made
+             * restart safe.
+             */
+            if (store.isStable()) {
+
+                store.commit();
+
+                store = reopenStore(store);
+
+            }
+            
+            // re-verify the full text index.
+            {
+
+                assertNotNull(store.getLexiconRelation().getSearchEngine());
+                
+                assertExpectedHits(store, "abc", null/* languageCode */,
+                        new BigdataValue[] { //
+                        f.createLiteral("abc"),//
+                                f.createLiteral("abc", "en") //
+                        });
+
+                assertExpectedHits(store, "tag", "en", new BigdataValue[] {//
+                        f.createLiteral("gutten tag", "de"), //
+                        f.createLiteral("tag team", "en") //
+                        });
+
+                assertExpectedHits(store, "tag", "de", new BigdataValue[] {//
+                        f.createLiteral("gutten tag", "de"), //
+                        f.createLiteral("tag team", "en") //
+                        });
+
+                assertExpectedHits(store, "GOOD DAY", "en", //
+                        .0f, // minCosine
+                        new BigdataValue[] {//
+                        f.createLiteral("good day", "en"), //
+                        f.createLiteral("the first day", "en") //
+                        });
+
+                assertExpectedHits(store, "GOOD DAY", "en", //
+                        .4f, // minCosine
+                        new BigdataValue[] {//
+                        f.createLiteral("good day", "en"), //
+                        });
+
+                assertExpectedHits(store, "day", "en", //
+                        .0f, // minCosine
+                        new BigdataValue[] {
+                        f.createLiteral("good day", "en"),
+                        f.createLiteral("the first day", "en") });
+                
+            }
+            
+        } finally {
+
+            store.__tearDownUnitTest();
+
+        }
+
+    }
     
 }


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.




[Bigdata-commit] SF.net SVN: bigdata:[4266] branches/QUADS_QUERY_BRANCH/bigdata

Fast, scalable, robust graph database platform

[Bigdata-commit] SF.net SVN: bigdata:[4266] branches/QUADS_QUERY_BRANCH/bigdata