[cweb-CVS] bigdata-rdf/src/java/com/bigdata/rdf TripleStore.java, 1.30, 1.31

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/cweb/bigdata-rdf/src/java/com/bigdata/rdf
In directory sc8-pr-cvs4.sourceforge.net:/tmp/cvs-serv22601/src/java/com/bigdata/rdf

Modified Files:
	TripleStore.java 
Log Message:
testing SAIL and lubm, including adding BTree#removeAll(), touching up some inferences, making it possible to load different RDF interchange formats, and adding JOIN ordering based on the sesame optimizer and the actual triple pattern selectivity in the data.

Index: TripleStore.java
===================================================================
RCS file: /cvsroot/cweb/bigdata-rdf/src/java/com/bigdata/rdf/TripleStore.java,v
retrieving revision 1.30
retrieving revision 1.31
diff -C2 -d -r1.30 -r1.31
*** TripleStore.java	15 Apr 2007 18:15:55 -0000	1.30
--- TripleStore.java	18 Apr 2007 17:29:08 -0000	1.31
***************
*** 55,59 ****
--- 55,61 ----
  import java.io.Reader;
  import java.util.Arrays;
+ import java.util.HashMap;
  import java.util.Locale;
+ import java.util.Map;
  import java.util.Properties;
  import java.util.UUID;
***************
*** 64,67 ****
--- 66,75 ----
  import org.openrdf.model.URI;
  import org.openrdf.model.Value;
+ import org.openrdf.rio.Parser;
+ import org.openrdf.sesame.constants.RDFFormat;
+ import org.openrdf.vocabulary.OWL;
+ import org.openrdf.vocabulary.RDF;
+ import org.openrdf.vocabulary.RDFS;
+ import org.openrdf.vocabulary.XmlSchema;

  import com.bigdata.btree.BTree;
***************
*** 97,100 ****
--- 105,110 ----
   * A triple store based on the <em>bigdata</em> architecture.
   * 
+  * @todo verify that re-loading the same data does not cause index writes.
+  * 
   * @todo Refactor to support transactions and concurrent load/query and test
   *       same.
***************
*** 406,409 ****
--- 416,425 ----
          keyBuilder = new RdfKeyBuilder(_keyBuilder);

+         // setup namespace mapping for serialization utility methods.
+         addNamespace(RDF.NAMESPACE, "rdf");
+         addNamespace(RDFS.NAMESPACE, "rdfs");
+         addNamespace(OWL.NAMESPACE, "owl");
+         addNamespace(XmlSchema.NAMESPACE, "xsd");
+ 
      }

***************
*** 555,561 ****
          getSPOIndex().insert(keyBuilder.statement2Key(_s, _p, _o),null);
          getPOSIndex().insert(keyBuilder.statement2Key(_p, _o, _s),null);
!         getOSPIndex().insert(keyBuilder.statement2Key(_p, _s, _p),null);

      }

      /**
--- 571,598 ----
          getSPOIndex().insert(keyBuilder.statement2Key(_s, _p, _o),null);
          getPOSIndex().insert(keyBuilder.statement2Key(_p, _o, _s),null);
!         getOSPIndex().insert(keyBuilder.statement2Key(_o, _s, _p),null);

      }
+     
+     /*
+      * @todo move this serialization stuff into a utility class.
+      */
+     
+     // namespace to prefix.
+     private final Map<String, String> uriToPrefix = new HashMap<String, String>();
+     
+     /**
+      * Defines a transient mapping from a URI to a namespace prefix that will be
+      * used for that URI by {@link #toString()}.
+      * 
+      * @param namespace
+      * 
+      * @param prefix
+      */
+     protected void addNamespace(String namespace, String prefix) {
+     
+         uriToPrefix.put(namespace, prefix);
+ 
+     }

      /**
***************
*** 566,591 ****
          IIndex ndx = getIdTermIndex();

!         URI s1 = (URI) ndx.lookup(keyBuilder.id2key(s));

          URI p1 = (URI) ndx.lookup(keyBuilder.id2key(p));

!         URI o1 = (URI) ndx.lookup(keyBuilder.id2key(o));

!         return ("< "+abbrev(s1)+", "+abbrev(p1)+", "+abbrev(o1)+" >");

      }

!     // @todo substitute in well know namespaces (rdf, rdfs, etc).
      private String abbrev( URI uri ) {

!         String t = uri.getURI();

!         int index = t.lastIndexOf('#');

!         if(index==-1) return t;

!         return t.substring(index);

      }

      /**
--- 603,674 ----
          IIndex ndx = getIdTermIndex();

!         Resource s1 = (Resource) ndx.lookup(keyBuilder.id2key(s));

          URI p1 = (URI) ndx.lookup(keyBuilder.id2key(p));

!         Value o1 = (Value) ndx.lookup(keyBuilder.id2key(o));

!         return ("< " + (s1 instanceof URI ? abbrev((URI) s1) : s1) + ", "
!                 + abbrev(p1) + ", "
!                 + (o1 instanceof URI ? abbrev((URI) o1) : o1) + " >");

      }

!     /**
!      * Substitutes in well-known namespaces (rdf, rdfs, etc).
!      */
      private String abbrev( URI uri ) {

!         String uriString = uri.getURI();

! //        final int index = uriString.lastIndexOf('#');
! //        
! //        if(index==-1) return uriString;
! //
! //        final String namespace = uriString.substring(0, index);

!         final String namespace = uri.getNamespace();

!         final String prefix = uriToPrefix.get(namespace);
!         
!         if(prefix != null) {
!             
!             return prefix+":"+uri.getLocalName();
!             
!         } else return uriString;

      }
+ 
+     /**
+      * Utility method dumps the statements in the store onto {@link System#err}
+      * using the SPO index (subject order).
+      */
+     public void dumpStore() {
+ 
+         final int nstmts = getStatementCount();
+         
+         System.err.println("#statements="+nstmts);
+         
+         IEntryIterator itr = getSPOIndex().rangeIterator(null, null);
+ 
+         int i = 0;
+         
+         while (itr.hasNext()) {
+ 
+             itr.next();
+             
+             i++;
+             
+             SPO spo = new SPO(KeyOrder.SPO,keyBuilder,itr.getKey());
+ 
+             System.err.println("#" + i + "\t" + toString(spo.s, spo.p, spo.o));
+             
+         }
+         
+     }
+     
+     /*
+      * 
+      */

      /**
***************
*** 864,869 ****
       * @return The #of statements removed.
       * 
!      * @todo the {@link #keyBuilder} is, which means that this is NOT thread
!      *       safe.
       * 
       * @todo this is not using the batch btree api.
--- 947,952 ----
       * @return The #of statements removed.
       * 
!      * @todo the {@link #keyBuilder} is being used, which means that this is NOT
!      *       thread safe.
       * 
       * @todo this is not using the batch btree api.
***************
*** 883,887 ****

                  getSPOIndex().remove(key);
! 
                  return 1;

--- 966,972 ----

                  getSPOIndex().remove(key);
!                 getPOSIndex().remove(keyBuilder.statement2Key(_p, _o, _s));
!                 getOSPIndex().remove(keyBuilder.statement2Key(_o, _s, _p));
!                 
                  return 1;

***************
*** 959,962 ****
--- 1044,1048 ----

                  }
+                 
              }

***************
*** 987,990 ****
--- 1073,1079 ----
      /**
       * Value used for a "NULL" term identifier.
+      * 
+      * @todo use this throughout rather than "0" since the value should really
+      *       be an <em>unsigned long</em>.
       */
      public static final long NULL = 0L;
***************
*** 1456,1459 ****
--- 1545,1550 ----
       * @return The pre-assigned termId -or- 0L iff the term is not known to the
       *         database.
+      * 
+      * @todo cache some well-known values?  E.g., those defined by the InferenceEngine.
       */
      public long getTermId(Value value) {
***************
*** 1587,1605 ****

      }
-     
-     /**
-      * Load a file into the triple store.
-      *
-      * @param file The file.
-      * 
-      * @param baseURI The baseURI or "" if none is known.
-      * 
-      * @throws IOException
-      */
-     public void loadData(File file, String baseURI ) throws IOException {
-         
-         loadData(file,baseURI,true);
-         
-     }

      /**
--- 1678,1681 ----
***************
*** 1639,1660 ****
      }

      /**
!      * Load a file into the triple store.
       * 
       * @param file
       *            The file.
       * @param baseURI
!      *            The baseURI or "" if none is known.
       * @param commit
       *            A {@link #commit()} will be performed IFF true.
       * 
!      * @return Statistics about the file load operation.
       * 
-      * @todo add a parameter for the batch size. large buffers for lots of small
-      *       files probably means lots of heap churn.
-      *
       * @throws IOException
       */
!     public LoadStats loadData(File file, String baseURI, boolean commit ) throws IOException {

          final long begin = System.currentTimeMillis();
--- 1715,1754 ----
      }

+ //  /**
+ //   * Load a file into the triple store.
+ //   *
+ //   * @param file The file.
+ //   * 
+ //   * @param baseURI The baseURI or "" if none is known.
+ //   * 
+ //   * @throws IOException
+ //   */
+ //  public void loadData(File file, String baseURI ) throws IOException {
+ //      
+ //      loadData(file,baseURI,true);
+ //      
+ //  }
+ 
      /**
!      * Load data into the triple store.
       * 
       * @param file
       *            The file.
       * @param baseURI
!      *            The baseURI or <code>""</code> if none is known.
!      * @param rdfFormat
!      *            The RDF interchange syntax to be parsed.
!      * @param verifyData
!      *            Controls the {@link Parser#setVerifyData(boolean)} option.
       * @param commit
       *            A {@link #commit()} will be performed IFF true.
       * 
!      * @return Statistics about the data load operation.
       * 
       * @throws IOException
+      *             if there is a problem when parsing the data.
       */
!     public LoadStats loadData(File file, String baseURI, RDFFormat rdfFormat,
!             boolean verifyData, boolean commit) throws IOException {

          final long begin = System.currentTimeMillis();
***************
*** 1664,1671 ****
          log.debug( "loading: " + file.getAbsolutePath() );

!         Reader reader = new BufferedReader(new InputStreamReader(
!                 new FileInputStream(file)));
! 
!         IRioLoader loader = new PresortRioLoader( this );

          loader.addRioLoaderListener( new RioLoaderListener() {
--- 1758,1762 ----
          log.debug( "loading: " + file.getAbsolutePath() );

!         IRioLoader loader = new PresortRioLoader( this, rdfFormat, verifyData );

          loader.addRioLoaderListener( new RioLoaderListener() {
***************
*** 1685,1691 ****
          });

          try {

!             loader.loadRdfXml( reader, baseURI );

              long nstmts = loader.getStatementsAdded();
--- 1776,1790 ----
          });

+         /*
+          * @todo change to use correct Parser method depending on Reader vs
+          * InputStream (SAX Source).  Changing this means updating all of
+          * the parser implementations, not just the PresortRioLoader.
+          */
+         Reader reader = new BufferedReader(new InputStreamReader(
+                 new FileInputStream(file)));
+         
          try {

!             loader.loadRdf( reader, baseURI );

              long nstmts = loader.getStatementsAdded();