From: Bryan T. <tho...@us...> - 2007-02-17 21:35:01
Update of /cvsroot/cweb/bigdata-rdf/src/java/com/bigdata/rdf
In directory sc8-pr-cvs4.sourceforge.net:/tmp/cvs-serv12912/src/java/com/bigdata/rdf

Modified Files:
	TempTripleStore.java TripleStore.java
Log Message:
Working through transactional isolation.

Index: TempTripleStore.java
===================================================================
RCS file: /cvsroot/cweb/bigdata-rdf/src/java/com/bigdata/rdf/TempTripleStore.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** TempTripleStore.java	17 Feb 2007 03:08:00 -0000	1.1
--- TempTripleStore.java	17 Feb 2007 21:34:57 -0000	1.2
***************
*** 48,69 ****
  package com.bigdata.rdf;

- import java.io.BufferedReader;
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.IOException;
- import java.io.InputStreamReader;
- import java.io.Reader;
- import java.util.Arrays;
  import java.util.Locale;
- import java.util.Properties;

  import org.apache.log4j.Logger;
- import org.openrdf.model.Resource;
- import org.openrdf.model.URI;
- import org.openrdf.model.Value;

- import com.bigdata.journal.IJournal;
- import com.bigdata.journal.Journal;
- import com.bigdata.journal.RootBlockView;
  import com.bigdata.journal.TemporaryStore;
  import com.bigdata.objndx.BTree;
--- 48,55 ----
***************
*** 73,214 ****
  import com.bigdata.rawstore.Bytes;
  import com.bigdata.rdf.inf.SPO;
- import com.bigdata.rdf.model.OptimizedValueFactory.OSPComparator;
- import com.bigdata.rdf.model.OptimizedValueFactory.POSComparator;
- import com.bigdata.rdf.model.OptimizedValueFactory.SPOComparator;
- import com.bigdata.rdf.model.OptimizedValueFactory.TermIdComparator;
  import com.bigdata.rdf.model.OptimizedValueFactory._Statement;
- import com.bigdata.rdf.model.OptimizedValueFactory._Value;
- import com.bigdata.rdf.model.OptimizedValueFactory._ValueSortKeyComparator;
- import com.bigdata.rdf.rio.IRioLoader;
- import com.bigdata.rdf.rio.PresortRioLoader;
- import com.bigdata.rdf.rio.RioLoaderEvent;
- import com.bigdata.rdf.rio.RioLoaderListener;
- import com.bigdata.rdf.serializers.RdfValueSerializer;
  import com.bigdata.rdf.serializers.StatementSerializer;
- import com.bigdata.rdf.serializers.TermIdSerializer;
- import com.bigdata.scaleup.PartitionedIndex;
- import com.bigdata.scaleup.PartitionedJournal;
- import com.bigdata.scaleup.SlaveJournal;
  import com.ibm.icu.text.Collator;
  import com.ibm.icu.text.RuleBasedCollator;

  /**
! * A triple store based on the <em>bigdata</em> architecture.
! *
! * @todo Refactor to use a delegation mechanism so that we can run with or
! * without partitioned indices? (All you have to do now is change the
! * class that is being extended from Journal to PartitionedJournal and
! * handle some different initialization properties.)
! *
! * @todo Play with the branching factor again. Now that we are using overflow
! * to evict data onto index segments we can use a higher branching factor
! * and simply evict more often. Is this worth it? We might want a lower
! * branching factor on the journal since we can not tell how large any
! * given write will be and then use larger branching factors on the index
! * segments.
! *
! * @todo Try loading some very large data sets; try Transient vs Disk vs Direct
! * modes. If Transient has significantly better performance then it
! * indicates that we are waiting on IO, so introduce AIO support in the
! * Journal and try Disk vs Direct with AIO. Otherwise, consider
! * refactoring the btree to have the values be variable length byte[]s
! * with serialization in the client and other tuning focused on IO (the
! * only questions with that approach are appropriate compression
! * techniques and transparently handling timestamps as part of the value
! * when using an isolated btree in a transaction).
! *
! * @todo The only added cost for a quad store is the additional statement
! * indices. There are only three more statement indices in a quad store.
! * Since statement indices are so cheap, it is probably worth implementing
! * them now, even if only as a configuration option.
! *
! * @todo Verify read after commit (restart safe) for large data sets and test
! * re-load rate for a data set and verify that no new statements are
! * added.
! *
! * @todo Add bulk data export (buffering statements and bulk resolving term
! * identifiers).
! *
! * @todo The use of long[] identifiers for statements also means that the SPO
! * and other statement indices are only locally ordered, so they can not be
! * used to perform a range scan that is ordered in the terms without
! * joining against the various term indices and then sorting the outputs.
! *
! * @todo Possibly save frequently seen terms in each batch for the next batch in
! * order to reduce unicode conversions.
! *
! * @todo Support metadata about the statement, e.g., whether or not it is an
! * inference.
! *
! * @todo Compute the MB/sec rate at which the store can load data and compare it
! * with the maximum transfer rate for the journal without the btree and
! * the maximum transfer rate to disk. This will tell us the overhead of
! * the btree implementation.
! *
! * @todo Try a variant in which we have metadata linking statements and terms
! * together. In this case we would have to go back to the terms and update
! * them to have metadata about the statement. It is a bit circular since
! * we can not create the statement until we have the terms and we can not
! * add the metadata to the terms until we have the statement.
! *
! * @todo Note that a very interesting solution for RDF places all data into a
! * statement index and then uses block compression techniques to remove
! * frequent terms, e.g., the repeated parts of the value. Also note that
! * there will be no "value" for an rdf statement since existence is all.
! * The key completely encodes the statement. So, another approach is to
! * bit code the repeated substrings found within the key in each leaf.
! * This way the serialized key size reflects only the #of distinctions.
! *
! * @todo I've been thinking about rdfs stores in the light of the work on
! * bigdata. Transactional isolation for rdf is really quite simple. Since
! * lexicons (uri, literal or bnode indices) do not (really) support
! * deletion, the only acts are asserting terms and asserting and retracting
! * statements. Asserting terms can lead to write-write conflicts,
! * which must be resolved and can cascade into the statement indices since
! * the statement key depends directly on the assigned term identifiers. A
! * statement always merges with an existing statement; inserts never cause
! * conflicts. Hence the only possible write-write conflict for the
! * statement indices is a write-delete conflict. Quads do not really make
! * this more complex (or expensive) since merges only occur when there is
! * a context match. However, entailments can cause twists depending on how
! * they are realized.
! *
! * If we do a pure RDF layer (vs RDF over GOM over bigdata), then it seems that
! * we could simply use a statement index (no lexicons for URIs, etc). Normally
! * this inflates the index size since you have lots of duplicate strings, but we
! * could just use block compression to factor out those strings when we evict
! * index leaves to disk. Prefix compression of keys will already do great things
! * for removing repetitive strings from the index nodes and block compression
! * will get at the leftover redundancy.
! *
! * So, one dead simple architecture is one index per access path (there is of
! * course some index reuse across the access paths) with the statements inline
! * in the index using prefix key compression and block compression to remove
! * redundancy. Inserts on this architecture would just send triples to the store
! * and the various indices would be maintained by the store itself. Those
! * indices could be load balanced in segments across a cluster.
! *
! * Since a read that goes through to disk reads an entire leaf at a time, the
! * most obvious drawback that I see is caching for commonly used assertions, but
! * that is easy to implement with some cache invalidation mechanism coupled to
! * deletes.
! *
! * I can also see how to realize very large bulk inserts outside of a
! * transactional context while handling concurrent transactions -- you just have
! * to reconcile as of the commit time of the bulk insert and you get to do that
! * using efficient compacting sort-merges of "perfect" bulk index segments. The
! * architecture would perform well on concurrent apstars style document loading
! * as well as what we might normally consider a bulk load (a few hundred
! * megabytes of data) within the normal transaction mechanisms, but if you
! * needed to ingest uniprot you would want to use a different technique :-)
! * outside of the normal transactional isolation mechanisms.
  *
! * I'm not sure what the right solution is for entailments, e.g., truth
! * maintenance vs eager closure. Either way, you would definitely want to avoid
! * tuple-at-a-time processing and batch things up so as to minimize the #of
! * index tests that you had to do. So, handling entailments and efficient joins
! * for high-level query languages would be the two places for more thought. And
! * there are little odd spots in RDF - handling bnodes, typed literals, and the
! * concept of a total sort order for the statement index.
  *
  * @author <a href="mailto:tho...@us...">Bryan Thompson</a>
--- 59,73 ----
  import com.bigdata.rawstore.Bytes;
  import com.bigdata.rdf.inf.SPO;
  import com.bigdata.rdf.model.OptimizedValueFactory._Statement;
  import com.bigdata.rdf.serializers.StatementSerializer;
  import com.ibm.icu.text.Collator;
  import com.ibm.icu.text.RuleBasedCollator;

  /**
! * A temporary triple store based on the <em>bigdata</em> architecture. Data
! * is buffered in memory but will overflow to disk for large stores.
  *
! * @todo refactor to use a delegate pattern so that we can share code with
! * {@link TripleStore} while deriving from a different base class.
  *
  * @author <a href="mailto:tho...@us...">Bryan Thompson</a>

Index: TripleStore.java
===================================================================
RCS file: /cvsroot/cweb/bigdata-rdf/src/java/com/bigdata/rdf/TripleStore.java,v
retrieving revision 1.15
retrieving revision 1.16
diff -C2 -d -r1.15 -r1.16
*** TripleStore.java	12 Feb 2007 21:51:43 -0000	1.15
--- TripleStore.java	17 Feb 2007 21:34:57 -0000	1.16
***************
*** 63,69 ****
  import org.openrdf.model.Value;

  import com.bigdata.journal.IJournal;
  import com.bigdata.journal.Journal;
- import com.bigdata.journal.RootBlockView;
  import com.bigdata.objndx.BTree;
  import com.bigdata.objndx.IIndex;
--- 63,69 ----
  import org.openrdf.model.Value;

+ import com.bigdata.journal.ICommitRecord;
  import com.bigdata.journal.IJournal;
  import com.bigdata.journal.Journal;
  import com.bigdata.objndx.BTree;
  import com.bigdata.objndx.IIndex;
***************
*** 85,89 ****
  import com.bigdata.rdf.serializers.TermIdSerializer;
  import com.bigdata.scaleup.PartitionedIndex;
- import com.bigdata.scaleup.PartitionedJournal;
  import com.bigdata.scaleup.SlaveJournal;
  import com.ibm.icu.text.Collator;
--- 85,88 ----
***************
*** 220,224 ****
       * by the store.
       */
!     static public transient final int ROOT_COUNTER = RootBlockView.FIRST_USER_ROOT + 5;

      public RdfKeyBuilder keyBuilder;
--- 219,223 ----
       * by the store.
       */
!     static public transient final int ROOT_COUNTER = ICommitRecord.FIRST_USER_ROOT;

      public RdfKeyBuilder keyBuilder;
***************
*** 349,353 ****
          long addr;

!         if ((addr = getAddr(ROOT_COUNTER)) == 0L) {

              // Note: first termId is ONE (1). Zero is reserved.
--- 348,352 ----
          long addr;

!         if ((addr = getRootAddr(ROOT_COUNTER)) == 0L) {

              // Note: first termId is ONE (1). Zero is reserved.
***************
*** 1021,1029 ****
      }

!     public void commit() {

          final long begin = System.currentTimeMillis();

!         super.commit();

          final long elapsed = System.currentTimeMillis() - begin;
--- 1020,1028 ----
      }

!     public long commit() {

          final long begin = System.currentTimeMillis();

!         final long commitTime = super.commit();

          final long elapsed = System.currentTimeMillis() - begin;
***************
*** 1033,1036 ****
--- 1032,1037 ----
          usage();

+         return commitTime;
+     }
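
To make the isolation discussion in the TempTripleStore javadoc above concrete: the only write-write conflict on a lexicon is two transactions assigning different identifiers to the same term, and resolving it forces a remap that cascades into the losing transaction's statement keys. A toy sketch of that reconciliation step, using plain Java collections and hypothetical names rather than the bigdata API:

import java.util.HashMap;
import java.util.Map;

public class TermConflictSketch {

    public static void main(String[] args) {

        // Committed lexicon state: term -> term identifier.
        Map<String, Long> committed = new HashMap<String, Long>();
        committed.put("http://example.org/x", 1L); // written by an earlier commit.

        // A concurrent transaction assigned its own identifier to the same term.
        String term = "http://example.org/x";
        long txTermId = 2L;

        Long committedId = committed.get(term);

        if (committedId != null && committedId.longValue() != txTermId) {
            // Write-write conflict: adopt the committed identifier. The remap
            // must cascade into the transaction's statement indices, since
            // statement keys are built directly from term identifiers.
            System.out.println("remap termId " + txTermId + " -> " + committedId);
        }
    }
}

Statement inserts, by contrast, simply merge: the key encodes the whole statement, so two inserts of the same triple are identical writes.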
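
Similarly, the "one index per access path" scheme amounts to writing one key per ordering for each statement, with the key alone encoding the triple (existence is all, so the value is empty). A self-contained sketch using sorted maps in place of the SPO/POS/OSP btrees; the key layout (three big-endian longs) is an assumption for illustration:

import java.nio.ByteBuffer;
import java.util.Comparator;
import java.util.TreeMap;

public class AccessPathSketch {

    /** Unsigned byte[] comparator: the total order required for index keys. */
    static final Comparator<byte[]> UNSIGNED = new Comparator<byte[]>() {
        public int compare(byte[] a, byte[] b) {
            int n = Math.min(a.length, b.length);
            for (int i = 0; i < n; i++) {
                int d = (a[i] & 0xff) - (b[i] & 0xff);
                if (d != 0) return d;
            }
            return a.length - b.length;
        }
    };

    // Stand-ins for the three statement indices.
    final TreeMap<byte[], byte[]> spo = new TreeMap<byte[], byte[]>(UNSIGNED);
    final TreeMap<byte[], byte[]> pos = new TreeMap<byte[], byte[]>(UNSIGNED);
    final TreeMap<byte[], byte[]> osp = new TreeMap<byte[], byte[]>(UNSIGNED);

    static final byte[] NO_VALUE = new byte[0];

    /** Packs three term identifiers into a 24-byte big-endian key. */
    static byte[] key(long a, long b, long c) {
        return ByteBuffer.allocate(24).putLong(a).putLong(b).putLong(c).array();
    }

    /** Inserting a statement writes one (idempotent) entry per access path. */
    void addStatement(long s, long p, long o) {
        spo.put(key(s, p, o), NO_VALUE);
        pos.put(key(p, o, s), NO_VALUE);
        osp.put(key(o, s, p), NO_VALUE);
    }
}

A lookup pattern such as "all statements for a given subject" then becomes a key-range scan on the appropriate index.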
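
The leaf-compression point is also easy to see in miniature: with sorted keys, each entry need only record the byte count it shares with its predecessor plus the remainder, and block compression then squeezes whatever redundancy is left. A toy illustration:

import java.util.Arrays;

public class PrefixCompressionSketch {

    /** #of leading bytes shared by two keys. */
    static int sharedPrefixLength(byte[] prev, byte[] next) {
        int n = Math.min(prev.length, next.length);
        for (int i = 0; i < n; i++) {
            if (prev[i] != next[i]) return i;
        }
        return n;
    }

    public static void main(String[] args) {
        byte[][] sortedKeys = {
                "http://example.org/a".getBytes(),
                "http://example.org/b".getBytes(),
                "http://example.org/bb".getBytes() };
        byte[] prev = new byte[0];
        for (byte[] key : sortedKeys) {
            int shared = sharedPrefixLength(prev, key);
            byte[] rest = Arrays.copyOfRange(key, shared, key.length);
            // prints "0 + http://example.org/a", then "19 + b", then "20 + b"
            System.out.println(shared + " + " + new String(rest));
            prev = key;
        }
    }
}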
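
Finally, on the TripleStore changes themselves: moving ROOT_COUNTER to ICommitRecord.FIRST_USER_ROOT and switching to getRootAddr() keeps the usual restart-safe pattern, where the address recorded under a well-known root slot either names the last committed state of an index or is 0L on a fresh store. Schematically (the create/load helpers here are hypothetical):

long addr = getRootAddr(ROOT_COUNTER);

if (addr == 0L) {
    // Nothing committed under this root yet: start a fresh counter.
    // Note: first termId is ONE (1). Zero is reserved.
    counter = createCounter();
} else {
    // Reload the counter state as of the last commit record.
    counter = loadCounter(addr);
}

The companion change, commit() now returning the timestamp from super.commit(), lets callers correlate their writes with a specific commit point rather than just observing that a commit happened.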