From: Bryan T. <tho...@us...> - 2007-02-17 21:35:01
Update of /cvsroot/cweb/bigdata-rdf/src/java/com/bigdata/rdf
In directory sc8-pr-cvs4.sourceforge.net:/tmp/cvs-serv12912/src/java/com/bigdata/rdf

Modified Files:
	TempTripleStore.java TripleStore.java
Log Message:
Working through transactional isolation.

Index: TempTripleStore.java
===================================================================
RCS file: /cvsroot/cweb/bigdata-rdf/src/java/com/bigdata/rdf/TempTripleStore.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** TempTripleStore.java	17 Feb 2007 03:08:00 -0000	1.1
--- TempTripleStore.java	17 Feb 2007 21:34:57 -0000	1.2
***************
*** 48,69 ****
  package com.bigdata.rdf;

- import java.io.BufferedReader;
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.IOException;
- import java.io.InputStreamReader;
- import java.io.Reader;
- import java.util.Arrays;
  import java.util.Locale;
- import java.util.Properties;

  import org.apache.log4j.Logger;
- import org.openrdf.model.Resource;
- import org.openrdf.model.URI;
- import org.openrdf.model.Value;

- import com.bigdata.journal.IJournal;
- import com.bigdata.journal.Journal;
- import com.bigdata.journal.RootBlockView;
  import com.bigdata.journal.TemporaryStore;
  import com.bigdata.objndx.BTree;
--- 48,55 ----
***************
*** 73,214 ****
  import com.bigdata.rawstore.Bytes;
  import com.bigdata.rdf.inf.SPO;
- import com.bigdata.rdf.model.OptimizedValueFactory.OSPComparator;
- import com.bigdata.rdf.model.OptimizedValueFactory.POSComparator;
- import com.bigdata.rdf.model.OptimizedValueFactory.SPOComparator;
- import com.bigdata.rdf.model.OptimizedValueFactory.TermIdComparator;
  import com.bigdata.rdf.model.OptimizedValueFactory._Statement;
- import com.bigdata.rdf.model.OptimizedValueFactory._Value;
- import com.bigdata.rdf.model.OptimizedValueFactory._ValueSortKeyComparator;
- import com.bigdata.rdf.rio.IRioLoader;
- import com.bigdata.rdf.rio.PresortRioLoader;
- import com.bigdata.rdf.rio.RioLoaderEvent;
- import com.bigdata.rdf.rio.RioLoaderListener;
- import com.bigdata.rdf.serializers.RdfValueSerializer;
  import com.bigdata.rdf.serializers.StatementSerializer;
- import com.bigdata.rdf.serializers.TermIdSerializer;
- import com.bigdata.scaleup.PartitionedIndex;
- import com.bigdata.scaleup.PartitionedJournal;
- import com.bigdata.scaleup.SlaveJournal;
  import com.ibm.icu.text.Collator;
  import com.ibm.icu.text.RuleBasedCollator;

  /**
! * A triple store based on the <em>bigdata</em> architecture.
! *
! * @todo Refactor to use a delegation mechanism so that we can run with or
! * without partitioned indices? (All you have to do now is change the
! * class that is being extended from Journal to PartitionedJournal and
! * handle some different initialization properties.)
! *
! * @todo Play with the branching factor again. Now that we are using overflow
! * to evict data onto index segments we can use a higher branching factor
! * and simply evict more often. Is this worth it? We might want a lower
! * branching factor on the journal since we can not tell how large any
! * given write will be and then use larger branching factors on the index
! * segments.
! *
! * @todo Try loading some very large data sets; try Transient vs Disk vs Direct
! * modes. If Transient has significantly better performance then it
! * indicates that we are waiting on IO, so introduce AIO support in the
! * Journal and try Disk vs Direct with AIO. Otherwise, consider
! * refactoring the btree to have the values be variable length byte[]s
! * with serialization in the client and other tuning focused on IO (the
! * only questions with that approach are appropriate compression
! * techniques and transparently handling timestamps as part of the value
! * when using an isolated btree in a transaction).
! *
! * @todo The only added cost for a quad store is the additional statement
! * indices. There are only three more statement indices in a quad store.
! * Since statement indices are so cheap, it is probably worth implementing
! * them now, even if only as a configuration option.
! *
! * @todo Verify read after commit (restart safe) for large data sets and test
! * re-load rate for a data set and verify that no new statements are
! * added.
! *
! * @todo Add bulk data export (buffering statements and bulk resolving term
! * identifiers).
! *
! * @todo The use of long[] identifiers for statements also means that the SPO
! * and other statement indices are only locally ordered, so they can not be
! * used to perform a range scan that is ordered in the terms without
! * joining against the various term indices and then sorting the outputs.
! *
! * @todo Possibly save frequently seen terms in each batch for the next batch in
! * order to reduce unicode conversions.
! *
! * @todo Support metadata about the statement, e.g., whether or not it is an
! * inference.
! *
! * @todo Compute the MB/sec rate at which the store can load data and compare it
! * with the maximum transfer rate for the journal without the btree and
! * the maximum transfer rate to disk. This will tell us the overhead of
! * the btree implementation.
! *
! * @todo Try a variant in which we have metadata linking statements and terms
! * together. In this case we would have to go back to the terms and update
! * them to have metadata about the statement. It is a bit circular since
! * we can not create the statement until we have the terms and we can not
! * add the metadata to the terms until we have the statement.
! *
! * @todo Note that a very interesting solution for RDF places all data into a
! * statement index and then uses block compression techniques to remove
! * frequent terms, e.g., the repeated parts of the value. Also note that
! * there will be no "value" for an rdf statement since existence is all.
! * The key completely encodes the statement. So, another approach is to
! * bit code the repeated substrings found within the key in each leaf.
! * This way the serialized key size reflects only the #of distinctions.
! *
! * @todo I've been thinking about rdfs stores in the light of the work on
! * bigdata. Transactional isolation for rdf is really quite simple. Since
! * lexicons (uri, literal or bnode indices) do not (really) support
! * deletion, the only acts are asserting terms and asserting and retracting
! * statements. Asserting terms can lead to write-write conflicts,
! * which must be resolved and can cascade into the statement indices since
! * the statement key depends directly on the assigned term identifiers. A
! * statement always merges with an existing statement; inserts never cause
! * conflicts. Hence the only possible write-write conflict for the
! * statement indices is a write-delete conflict. Quads do not really make
! * this more complex (or expensive) since merges only occur when there is
! * a context match. However, entailments can cause twists depending on how
! * they are realized.
! *
! * If we do a pure RDF layer (vs RDF over GOM over bigdata), then it seems that
! * we could simply use a statement index (no lexicons for URIs, etc). Normally
! * this inflates the index size since you have lots of duplicate strings, but we
! * could just use block compression to factor out those strings when we evict
! * index leaves to disk. Prefix compression of keys will already do great things
! * for removing repetitive strings from the index nodes and block compression
! * will get at the leftover redundancy.
! *
! * So, one dead simple architecture is one index per access path (there is of
! * course some index reuse across the access paths) with the statements inline
! * in the index using prefix key compression and block compression to remove
! * redundancy. Inserts on this architecture would just send triples to the store
! * and the various indices would be maintained by the store itself. Those
! * indices could be load balanced in segments across a cluster.
! *
! * Since a read that goes through to disk reads an entire leaf at a time, the
! * most obvious drawback that I see is caching for commonly used assertions, but
! * that is easy to implement with some cache invalidation mechanism coupled to
! * deletes.
! *
! * I can also see how to realize very large bulk inserts outside of a
! * transactional context while handling concurrent transactions -- you just have
! * to reconcile as of the commit time of the bulk insert and you get to do that
! * using efficient compacting sort-merges of "perfect" bulk index segments. The
! * architecture would perform well on concurrent apstars style document loading
! * as well as what we might normally consider a bulk load (a few hundred
! * megabytes of data) within the normal transaction mechanisms, but if you
! * needed to ingest uniprot you would want to use a different technique :-)
! * outside of the normal transactional isolation mechanisms.
  *
! * I'm not sure what the right solution is for entailments, e.g., truth
! * maintenance vs eager closure. Either way, you would definitely want to avoid
! * tuple-at-a-time processing and batch things up so as to minimize the #of
! * index tests that you had to do. So, handling entailments and efficient joins
! * for high-level query languages would be the two places for more thought. And
! * there are little odd spots in RDF - handling bnodes, typed literals, and the
! * concept of a total sort order for the statement index.
  *
  * @author <a href="mailto:tho...@us...">Bryan Thompson</a>
--- 59,73 ----
  import com.bigdata.rawstore.Bytes;
  import com.bigdata.rdf.inf.SPO;
  import com.bigdata.rdf.model.OptimizedValueFactory._Statement;
  import com.bigdata.rdf.serializers.StatementSerializer;
  import com.ibm.icu.text.Collator;
  import com.ibm.icu.text.RuleBasedCollator;

  /**
! * A temporary triple store based on the <em>bigdata</em> architecture. Data
! * is buffered in memory but will overflow to disk for large stores.
  *
! * @todo refactor to use a delegate pattern so that we can share code with
! * {@link TripleStore} while deriving from a different base class.
  *
  * @author <a href="mailto:tho...@us...">Bryan Thompson</a>

Index: TripleStore.java
===================================================================
RCS file: /cvsroot/cweb/bigdata-rdf/src/java/com/bigdata/rdf/TripleStore.java,v
retrieving revision 1.15
retrieving revision 1.16
diff -C2 -d -r1.15 -r1.16
*** TripleStore.java	12 Feb 2007 21:51:43 -0000	1.15
--- TripleStore.java	17 Feb 2007 21:34:57 -0000	1.16
***************
*** 63,69 ****
  import org.openrdf.model.Value;

  import com.bigdata.journal.IJournal;
  import com.bigdata.journal.Journal;
- import com.bigdata.journal.RootBlockView;
  import com.bigdata.objndx.BTree;
  import com.bigdata.objndx.IIndex;
--- 63,69 ----
  import org.openrdf.model.Value;

+ import com.bigdata.journal.ICommitRecord;
  import com.bigdata.journal.IJournal;
  import com.bigdata.journal.Journal;
  import com.bigdata.objndx.BTree;
  import com.bigdata.objndx.IIndex;
***************
*** 85,89 ****
  import com.bigdata.rdf.serializers.TermIdSerializer;
  import com.bigdata.scaleup.PartitionedIndex;
- import com.bigdata.scaleup.PartitionedJournal;
  import com.bigdata.scaleup.SlaveJournal;
  import com.ibm.icu.text.Collator;
--- 85,88 ----
***************
*** 220,224 ****
       * by the store.
       */
!     static public transient final int ROOT_COUNTER = RootBlockView.FIRST_USER_ROOT + 5;

      public RdfKeyBuilder keyBuilder;
--- 219,223 ----
       * by the store.
       */
!     static public transient final int ROOT_COUNTER = ICommitRecord.FIRST_USER_ROOT;

      public RdfKeyBuilder keyBuilder;
***************
*** 349,353 ****
          long addr;

!         if ((addr = getAddr(ROOT_COUNTER)) == 0L) {

              // Note: first termId is ONE (1). Zero is reserved.
--- 348,352 ----
          long addr;

!         if ((addr = getRootAddr(ROOT_COUNTER)) == 0L) {

              // Note: first termId is ONE (1). Zero is reserved.
***************
*** 1021,1029 ****
      }

!     public void commit() {

          final long begin = System.currentTimeMillis();

!         super.commit();

          final long elapsed = System.currentTimeMillis() - begin;
--- 1020,1028 ----
      }

!     public long commit() {

          final long begin = System.currentTimeMillis();

!         final long commitTime = super.commit();

          final long elapsed = System.currentTimeMillis() - begin;
***************
*** 1033,1036 ****
--- 1032,1037 ----
          usage();

+         return commitTime;
+     }
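
To make the isolation discussion in the TempTripleStore javadoc above concrete: the only write-write conflict on a lexicon is two transactions assigning different identifiers to the same term, and resolving it forces a remap that cascades into the losing transaction's statement keys. A toy sketch of that reconciliation step, using plain Java collections and hypothetical names rather than the bigdata API:

import java.util.HashMap;
import java.util.Map;

public class TermConflictSketch {

    public static void main(String[] args) {

        // Committed lexicon state: term -> term identifier.
        Map<String, Long> committed = new HashMap<String, Long>();
        committed.put("http://example.org/x", 1L); // written by an earlier commit.

        // A concurrent transaction assigned its own identifier to the same term.
        String term = "http://example.org/x";
        long txTermId = 2L;

        Long committedId = committed.get(term);

        if (committedId != null && committedId.longValue() != txTermId) {
            // Write-write conflict: adopt the committed identifier. The remap
            // must cascade into the transaction's statement indices, since
            // statement keys are built directly from term identifiers.
            System.out.println("remap termId " + txTermId + " -> " + committedId);
        }
    }
}

Statement inserts, by contrast, simply merge: the key encodes the whole statement, so two inserts of the same triple are identical writes.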
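
Similarly, the "one index per access path" scheme amounts to writing one key per ordering for each statement, with the key alone encoding the triple (existence is all, so the value is empty). A self-contained sketch using sorted maps in place of the SPO/POS/OSP btrees; the key layout (three big-endian longs) is an assumption for illustration:

import java.nio.ByteBuffer;
import java.util.Comparator;
import java.util.TreeMap;

public class AccessPathSketch {

    /** Unsigned byte[] comparator: the total order required for index keys. */
    static final Comparator<byte[]> UNSIGNED = new Comparator<byte[]>() {
        public int compare(byte[] a, byte[] b) {
            int n = Math.min(a.length, b.length);
            for (int i = 0; i < n; i++) {
                int d = (a[i] & 0xff) - (b[i] & 0xff);
                if (d != 0) return d;
            }
            return a.length - b.length;
        }
    };

    // Stand-ins for the three statement indices.
    final TreeMap<byte[], byte[]> spo = new TreeMap<byte[], byte[]>(UNSIGNED);
    final TreeMap<byte[], byte[]> pos = new TreeMap<byte[], byte[]>(UNSIGNED);
    final TreeMap<byte[], byte[]> osp = new TreeMap<byte[], byte[]>(UNSIGNED);

    static final byte[] NO_VALUE = new byte[0];

    /** Packs three term identifiers into a 24-byte big-endian key. */
    static byte[] key(long a, long b, long c) {
        return ByteBuffer.allocate(24).putLong(a).putLong(b).putLong(c).array();
    }

    /** Inserting a statement writes one (idempotent) entry per access path. */
    void addStatement(long s, long p, long o) {
        spo.put(key(s, p, o), NO_VALUE);
        pos.put(key(p, o, s), NO_VALUE);
        osp.put(key(o, s, p), NO_VALUE);
    }
}

A lookup pattern such as "all statements for a given subject" then becomes a key-range scan on the appropriate index.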
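
The leaf-compression point is also easy to see in miniature: with sorted keys, each entry need only record the byte count it shares with its predecessor plus the remainder, and block compression then squeezes whatever redundancy is left. A toy illustration:

import java.util.Arrays;

public class PrefixCompressionSketch {

    /** #of leading bytes shared by two keys. */
    static int sharedPrefixLength(byte[] prev, byte[] next) {
        int n = Math.min(prev.length, next.length);
        for (int i = 0; i < n; i++) {
            if (prev[i] != next[i]) return i;
        }
        return n;
    }

    public static void main(String[] args) {
        byte[][] sortedKeys = {
                "http://example.org/a".getBytes(),
                "http://example.org/b".getBytes(),
                "http://example.org/bb".getBytes() };
        byte[] prev = new byte[0];
        for (byte[] key : sortedKeys) {
            int shared = sharedPrefixLength(prev, key);
            byte[] rest = Arrays.copyOfRange(key, shared, key.length);
            // prints "0 + http://example.org/a", then "19 + b", then "20 + b"
            System.out.println(shared + " + " + new String(rest));
            prev = key;
        }
    }
}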
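
Finally, on the TripleStore changes themselves: moving ROOT_COUNTER to ICommitRecord.FIRST_USER_ROOT and switching to getRootAddr() keeps the usual restart-safe pattern, where the address recorded under a well-known root slot either names the last committed state of an index or is 0L on a fresh store. Schematically (the create/load helpers here are hypothetical):

long addr = getRootAddr(ROOT_COUNTER);

if (addr == 0L) {
    // Nothing committed under this root yet: start a fresh counter.
    // Note: first termId is ONE (1). Zero is reserved.
    counter = createCounter();
} else {
    // Reload the counter state as of the last commit record.
    counter = loadCounter(addr);
}

The companion change, commit() now returning the timestamp from super.commit(), lets callers correlate their writes with a specific commit point rather than just observing that a commit happened.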