From: <tho...@us...> - 2011-03-03 18:38:58
|
Revision: 4266 http://bigdata.svn.sourceforge.net/bigdata/?rev=4266&view=rev Author: thompsonbry Date: 2011-03-03 18:38:51 +0000 (Thu, 03 Mar 2011) Log Message: ----------- https://sourceforge.net/apps/trac/bigdata/ticket/265 - Added a method to re-build the full text index and a unit test for the same (see TestFullTextIndex). - Javadoc edits to the ITextIndex interface. https://sourceforge.net/apps/trac/bigdata/ticket/221 - Modified AbstractJournal#dropIndex(name) to conditionally invoke removeAll() on the index in order to reclaim its storage when the index is backed by the RWStore. Modified Paths: -------------- branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/journal/AbstractJournal.java branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/BigdataRDFFullTextIndex.java branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/ITextIndexer.java branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/LexiconRelation.java branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/rdf/lexicon/TestFullTextIndex.java Modified: branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/journal/AbstractJournal.java =================================================================== --- branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/journal/AbstractJournal.java 2011-03-03 17:12:31 UTC (rev 4265) +++ branches/QUADS_QUERY_BRANCH/bigdata/src/java/com/bigdata/journal/AbstractJournal.java 2011-03-03 18:38:51 UTC (rev 4266) @@ -3448,14 +3448,25 @@ } - /** - * Drops the named index. The index will no longer participate in atomic - * commits and will not be visible to new transactions. Resources are NOT - * reclaimed on the {@link AbstractJournal} (it is an immortal store) and - * historical states of the index will continue to be accessible. - */ + /** + * Drops the named index. The index will no longer participate in atomic + * commits and will not be visible to new transactions. 
Storage will be + * reclaimed IFF the backing store supports that functionality. + */ public void dropIndex(final String name) { + final BTree ndx = getIndex(name); + + if(ndx == null) + throw new NoSuchIndexException(name); + + if(getBufferStrategy() instanceof RWStrategy) { + /* + * Reclaim storage associated with the index. + */ + ndx.removeAll(); + } + final ReadLock lock = _fieldReadWriteLock.readLock(); lock.lock(); Modified: branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/BigdataRDFFullTextIndex.java =================================================================== --- branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/BigdataRDFFullTextIndex.java 2011-03-03 17:12:31 UTC (rev 4265) +++ branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/BigdataRDFFullTextIndex.java 2011-03-03 18:38:51 UTC (rev 4266) @@ -110,11 +110,15 @@ assertWritable(); - getIndexManager().dropIndex(getNamespace() + "." + NAME_SEARCH); + final String name = getNamespace() + "." + NAME_SEARCH; + + getIndexManager().dropIndex(name); } - public void index(int capacity, Iterator<BigdataValue> valuesIterator) { + public void index(final int capacity, + final Iterator<BigdataValue> valuesIterator) { + final TokenBuffer buffer = new TokenBuffer(capacity, this); int n = 0; Modified: branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/ITextIndexer.java =================================================================== --- branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/ITextIndexer.java 2011-03-03 17:12:31 UTC (rev 4265) +++ branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/ITextIndexer.java 2011-03-03 18:38:51 UTC (rev 4266) @@ -68,6 +68,9 @@ * are tokenized using the default {@link Locale}. * </p> * + * @param capacity + * A hint to the underlying layer about the buffer size before an + * incremental flush of the index. 
* @param itr * Iterator visiting the terms to be indexed. * Modified: branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/LexiconRelation.java =================================================================== --- branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/LexiconRelation.java 2011-03-03 17:12:31 UTC (rev 4265) +++ branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/java/com/bigdata/rdf/lexicon/LexiconRelation.java 2011-03-03 18:38:51 UTC (rev 4266) @@ -63,21 +63,24 @@ import com.bigdata.btree.IRangeQuery; import com.bigdata.btree.ITuple; import com.bigdata.btree.ITupleIterator; +import com.bigdata.btree.ITupleSerializer; import com.bigdata.btree.IndexMetadata; import com.bigdata.btree.filter.PrefixFilter; +import com.bigdata.btree.filter.TupleFilter; import com.bigdata.btree.keys.DefaultKeyBuilderFactory; import com.bigdata.btree.keys.IKeyBuilder; import com.bigdata.btree.keys.KVO; import com.bigdata.btree.keys.KeyBuilder; import com.bigdata.btree.keys.StrengthEnum; +import com.bigdata.btree.proc.IResultHandler; import com.bigdata.btree.proc.AbstractKeyArrayIndexProcedure.ResultBuffer; import com.bigdata.btree.proc.AbstractKeyArrayIndexProcedure.ResultBufferHandler; import com.bigdata.btree.proc.BatchLookup.BatchLookupConstructor; -import com.bigdata.btree.proc.IResultHandler; import com.bigdata.btree.raba.IRaba; import com.bigdata.cache.ConcurrentWeakValueCacheWithBatchedUpdates; import com.bigdata.journal.IIndexManager; import com.bigdata.journal.IResourceLock; +import com.bigdata.journal.ITx; import com.bigdata.rawstore.Bytes; import com.bigdata.rdf.internal.IDatatypeURIResolver; import com.bigdata.rdf.internal.IExtensionFactory; @@ -88,6 +91,7 @@ import com.bigdata.rdf.internal.TermId; import com.bigdata.rdf.lexicon.Term2IdWriteProc.Term2IdWriteProcConstructor; import com.bigdata.rdf.model.BigdataBNode; +import com.bigdata.rdf.model.BigdataLiteral; import com.bigdata.rdf.model.BigdataURI; import 
com.bigdata.rdf.model.BigdataValue; import com.bigdata.rdf.model.BigdataValueFactory; @@ -1619,6 +1623,86 @@ } /** + * Utility method to (re-)build the full text index. This is a high latency + * operation for a database of any significant size. You must be using the + * unisolated view of the {@link AbstractTripleStore} for this operation. + * {@link AbstractTripleStore.Options#TEXT_INDEX} must be enabled. This + * operation is only supported when the {@link ITextIndexer} uses the + * {@link FullTextIndex} class. + * + * TODO This will have to be redone once we finish + * http://sourceforge.net/apps/trac/bigdata/ticket/109 (store large literals + * as blobs) since the ID2TERM index will disappear. + */ + @SuppressWarnings("unchecked") + public void rebuildTextIndex() { + + if (getTimestamp() != ITx.UNISOLATED) + throw new UnsupportedOperationException(); + + if(!textIndex) + throw new UnsupportedOperationException(); + + final ITextIndexer textIndexer = getSearchEngine(); + + if (textIndexer == null) { + throw new UnsupportedOperationException(); + } + + // destroy the existing text index. + textIndexer.destroy(); + + // create a new index. + textIndexer.create(); + + // the index to scan for the RDF Literals. + final IIndex id2term = getId2TermIndex(); + + // used to decode the keys and values of the id2term tuples. + final ITupleSerializer tupSer = id2term.getIndexMetadata() + .getTupleSerializer(); + + /* + * Visit all plain, language code, and datatype literals in the lexicon. + * + * Note: This uses a filter on the ITupleIterator in order to filter out + * non-literal terms before they are shipped from a remote index shard. 
+ */ + final Iterator<BigdataValue> itr = new Striterator(id2term + .rangeIterator(null/* fromKey */, null/* toKey */, + 0/* capacity */, IRangeQuery.DEFAULT, + new TupleFilter<BigdataValue>() { + private static final long serialVersionUID = 1L; + protected boolean isValid( + final ITuple<BigdataValue> obj) { + final IV iv = (IV) tupSer.deserializeKey(obj); + if (!iv.isInline() && iv.isLiteral()) { + return true; + } + return false; + } + })).addFilter(new Resolver() { + private static final long serialVersionUID = 1L; + + protected Object resolve(final Object obj) { + final BigdataLiteral lit = (BigdataLiteral) tupSer + .deserialize((ITuple) obj); +// System.err.println("lit: "+lit); + return lit; + } + }); + + final int capacity = 10000; + + while (itr.hasNext()) { + + indexTermText(capacity, itr); + + } + + } + + /** * Batch resolution of internal values to {@link BigdataValue}s. * * @param ivs Modified: branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/rdf/lexicon/TestFullTextIndex.java =================================================================== --- branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/rdf/lexicon/TestFullTextIndex.java 2011-03-03 17:12:31 UTC (rev 4265) +++ branches/QUADS_QUERY_BRANCH/bigdata-rdf/src/test/com/bigdata/rdf/lexicon/TestFullTextIndex.java 2011-03-03 18:38:51 UTC (rev 4266) @@ -362,5 +362,161 @@ } } + + /** + * Unit test for {@link LexiconRelation#rebuildTextIndex()}. + */ + public void test_rebuildIndex() { + + AbstractTripleStore store = getStore(); + + try { + + assertNotNull(store.getLexiconRelation().getSearchEngine()); + + final BigdataValueFactory f = store.getValueFactory(); + + final BigdataValue[] terms = new BigdataValue[] {// + f.createLiteral("abc"),// + f.createLiteral("abc", "en"),// + f.createLiteral("good day", "en"),// + f.createLiteral("gutten tag", "de"),// + f.createLiteral("tag team", "en"),// + f.createLiteral("the first day", "en"),// // 'the' is a stopword. 
+ + f.createURI("http://www.bigdata.com"),// + f.asValue(RDF.TYPE),// + f.asValue(RDFS.SUBCLASSOF),// + f.asValue(XMLSchema.DECIMAL),// + + f.createBNode(UUID.randomUUID().toString()),// + f.createBNode("a12"),// + }; + + store.addTerms(terms); + + if(log.isInfoEnabled()) { + log.info(store.getLexiconRelation().dumpTerms()); + } + + /* + * Note: the language code is only used when tokenizing literals. It + * IS NOT applied as a filter to the recovered literals. + */ + + assertExpectedHits(store, "abc", null/* languageCode */, + new BigdataValue[] { // + f.createLiteral("abc"),// + f.createLiteral("abc", "en") // + }); + + assertExpectedHits(store, "tag", "en", new BigdataValue[] {// + f.createLiteral("gutten tag", "de"), // + f.createLiteral("tag team", "en") // + }); + + assertExpectedHits(store, "tag", "de", new BigdataValue[] {// + f.createLiteral("gutten tag", "de"), // + f.createLiteral("tag team", "en") // + }); + + assertExpectedHits(store, "GOOD DAY", "en", // + .0f, // minCosine + new BigdataValue[] {// + f.createLiteral("good day", "en"), // + f.createLiteral("the first day", "en") // + }); + + assertExpectedHits(store, "GOOD DAY", "en", // + .4f, // minCosine + new BigdataValue[] {// + f.createLiteral("good day", "en"), // + }); + + assertExpectedHits(store, "day", "en", // + .0f, // minCosine + new BigdataValue[] { + f.createLiteral("good day", "en"), + f.createLiteral("the first day", "en") }); + + // 'the' is a stopword, so there are no hits. + assertExpectedHits(store, "the", "en", new BigdataValue[] {}); + + /* + * re-open the store before search to verify that the data were made + * restart safe. + */ + if (store.isStable()) { + + store.commit(); + + store = reopenStore(store); + + } + + // rebuild the full text index. + store.getLexiconRelation().rebuildTextIndex(); + + /* + * re-open the store before search to verify that the data were made + * restart safe. 
+ */ + if (store.isStable()) { + + store.commit(); + + store = reopenStore(store); + + } + + // re-verify the full text index. + { + + assertNotNull(store.getLexiconRelation().getSearchEngine()); + + assertExpectedHits(store, "abc", null/* languageCode */, + new BigdataValue[] { // + f.createLiteral("abc"),// + f.createLiteral("abc", "en") // + }); + + assertExpectedHits(store, "tag", "en", new BigdataValue[] {// + f.createLiteral("gutten tag", "de"), // + f.createLiteral("tag team", "en") // + }); + + assertExpectedHits(store, "tag", "de", new BigdataValue[] {// + f.createLiteral("gutten tag", "de"), // + f.createLiteral("tag team", "en") // + }); + + assertExpectedHits(store, "GOOD DAY", "en", // + .0f, // minCosine + new BigdataValue[] {// + f.createLiteral("good day", "en"), // + f.createLiteral("the first day", "en") // + }); + + assertExpectedHits(store, "GOOD DAY", "en", // + .4f, // minCosine + new BigdataValue[] {// + f.createLiteral("good day", "en"), // + }); + + assertExpectedHits(store, "day", "en", // + .0f, // minCosine + new BigdataValue[] { + f.createLiteral("good day", "en"), + f.createLiteral("the first day", "en") }); + + } + + } finally { + + store.__tearDownUnitTest(); + + } + + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |