[DL-Learner SVN] SF.net SVN: dl-learner:[3391] trunk/scripts

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Revision: 3391
          http://dl-learner.svn.sourceforge.net/dl-learner/?rev=3391&view=rev
Author:   kmpf
Date:     2011-11-10 08:15:41 +0000 (Thu, 10 Nov 2011)
Log Message:
-----------
Fixed a bug in the SPARQL query that retrieves the data used in the internal PDBRdfModel.

Modified Paths:
--------------
    trunk/scripts/pom.xml
    trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java
    trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBIdRdfModel.java
    trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBWekaLearner.java

Modified: trunk/scripts/pom.xml
===================================================================

--- trunk/scripts/pom.xml	2011-11-09 10:02:03 UTC (rev 3390)
+++ trunk/scripts/pom.xml	2011-11-10 08:15:41 UTC (rev 3391)
@@ -64,5 +64,11 @@
 			<artifactId>commons-compress</artifactId>
 			<version>1.2</version>
 		</dependency>
+		<dependency>
+			<groupId>weka</groupId>
+			<artifactId>weka</artifactId>
+			<version>3.6.5</version>
+		</dependency>
+		
     </dependencies>
 </project>

Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java
===================================================================
--- trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java	2011-11-09 10:02:03 UTC (rev 3390)
+++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java	2011-11-10 08:15:41 UTC (rev 3391)
@@ -109,8 +109,10 @@
 		/*
 		 * data for test purpose
 		 */
-		PDBProtein testProtein = new PDBProtein("1XFF");
+		PDBProtein testProtein = new PDBProtein("1XFF","A");
 //		PDBProtein testProtein = new PDBProtein("1LMB", "3");
+//		PDBProtein testProtein = new PDBProtein("8ABP");
+
 		
 		/*
 		 * create a training data set
@@ -185,7 +187,8 @@
 				trainmodel.removeStatementsWithPoperty(endsAt);
 				Resource residue = ResourceFactory.createResource("http://bio2rdf.org/pdb:Residue");
 				trainmodel.removeStatementsWithObject(residue);
-				
+				Property isPartOf = ResourceFactory.createProperty("http://purl.org/dc/terms/", "isPartOf");
+				trainmodel.removeStatementsWithPoperty(isPartOf);
 				/*
 				 * we add the information which amino acid is the fourth predecessor of which other amino acid 
 				 */
@@ -228,20 +231,23 @@
 				 * proteins that originate from that particular species. If it already exists
 				 * we will append to it.
 				 */
-				File speciesProteins = new File(_dataDir + protein.getSpecies() + ".pos");
 				
-				try {
-					String line =  protein.getPdbID() + "." + protein.getChainID()  + "." + protein.getSpecies() + "\n";
-					FileWriter out = new FileWriter(speciesProteins, true);
-					_logger.debug("Write " + line + "to file " + speciesProteins.getPath());
-					out.write(line);
-					out.close();
-				} catch (FileNotFoundException e) {
-					_logger.error("Could not find file " + speciesProteins.getPath() );
-					e.printStackTrace();
-				} catch (IOException e) {
-					_logger.error("Something went wrong while trying to write to " + speciesProteins.getPath() );
-					e.printStackTrace();
+				if (protein.getSpecies() != ""){
+					File speciesProteins = new File(_dataDir + protein.getSpecies() + ".pos");
+					
+					try {
+						String line =  protein.getPdbID() + "." + protein.getChainID()  + "." + protein.getSpecies() + "\n";
+						FileWriter out = new FileWriter(speciesProteins, true);
+						_logger.debug("Write " + line + " to file " + speciesProteins.getPath());
+						out.write(line);
+						out.close();
+					} catch (FileNotFoundException e) {
+						_logger.error("Could not find file " + speciesProteins.getPath() + speciesProteins.getName());
+						e.printStackTrace();
+					} catch (IOException e) {
+						_logger.error("Something went wrong while trying to write to " + speciesProteins.getPath() + speciesProteins.getName());
+						e.printStackTrace();
+					}
 				}
 			}
 		}

Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBIdRdfModel.java
===================================================================
--- trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBIdRdfModel.java	2011-11-09 10:02:03 UTC (rev 3390)
+++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBIdRdfModel.java	2011-11-10 08:15:41 UTC (rev 3391)
@@ -100,12 +100,11 @@
 			"PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> " +
 			"PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> " +
 			"PREFIX fn: <http://www.w3.org/2005/xpath-functions#> " +
-			"CONSTRUCT { pdb:" + this.getProtein().getPdbID() + "/extraction/source/gene/organism rdfs:label ?species. }" +
+			"CONSTRUCT { <http://bio2rdf.org/pdb:" + this.getProtein().getPdbID() + "/extraction/source/gene/organism> rdfs:label ?species. }" +
     		"WHERE { ?x1 dcterms:isPartOf ?x2 ." +
-	    		" ?x1 rdf:type> ?x3 ." +
+	    		" ?x1 rdf:type ?x3 ." +
 	    		" ?x1 pdb:isImmediatelyBefore ?x4 ." +
-				" ?x5 rdfs:label ?species " +
-				" FILTER (str(?x5) = fn:concat(str(?x2), '/extraction/source/gene/organism')) . }";
+				" OPTIONAL { ?x5 rdfs:label ?species FILTER (str(?x5) = fn:concat(str(?x2), '/extraction/source/gene/organism')) . } . }";
 		
 		// System.out.println(queryString);
 		
@@ -151,7 +150,7 @@
 				NodeIterator niter = model.listObjectsOfProperty(nextRes, hasValue);
 				sequence = niter.next().toString();
 				
-				System.out.println(sequence);
+				System.out.println("Sequence: " + sequence);
 			}
 		}
     	return sequence;
@@ -206,23 +205,25 @@
 	    		" ?x3 dcterms:isPartOf ?x4 ." +
 	    		" ?x4 rdf:type <http://bio2rdf.org/pdb:Polypeptide(L)> ." +
 	    		" ?x5 dcterms:isPartOf ?x4 ." +
-	    		" ?x5 rdf:type ?x6 .";
+	    		" ?x5 rdf:type ?x6 ." +
+	    		" ?x5 pdb:hasChainPosition ?x8 . " +
+	    		" OPTIONAL { ?x5 pdb:isImmediatelyBefore ?x7 . } . ";
+		 
 		 if (chainID.length() == 1 && pdbID.length() == 4)
 			{
 				queryString +=
-						" ?x5 pdb:hasChainPosition ?x8 ." +
-						" ?x8 dcterms:isPartOf pdb:" + 
+						" ?x8 dcterms:isPartOf <http://bio2rdf.org/pdb:" + 
 								pdbID.toUpperCase() +
 								"/chain_" + chainID.toUpperCase() + "> .";
 			}
 		 queryString +=
-				" ?organism rdfs:label ?organismName " +
-				"FILTER (str(?organism) = fn:concat(str(?x4), '/extraction/source/gene/organism')) . " +
+				" ?x4 pdb:hasPolymerSequence ?seq . " +
 	    		" ?seq rdf:type pdb:PolymerSequence . " +
-	    		" ?seq pdb:hasValue ?sequence ." +
+	    		" ?seq pdb:hasValue ?sequence . " +
 	    		// with the Optional clause i get the information by which amino acid
 	    		// a amino acid is followed
-	    		" OPTIONAL { ?x5 pdb:isImmediatelyBefore ?x7 . } .}";
+	    		" OPTIONAL { ?organism rdfs:label ?organismName " +
+	    			"FILTER (str(?organism) = fn:concat(str(?x4), '/extraction/source/gene/organism')) . } . }";
 		
 		System.out.println(queryString);
 		Query query = QueryFactory.create(queryString);
@@ -324,6 +325,7 @@
 		while (riter.hasNext()) {
 			// Initialization of variables needed
 			Resource firstAA = riter.nextResource();
+			System.out.println("First AA: " + firstAA.getLocalName());
 			Resource currentAA  = firstAA;
 			Resource nextAA = firstAA;
 			boolean inHelix = false;
@@ -358,7 +360,16 @@
 			} while (currentAA.hasProperty(iib)) ;
 		}
 		_positives = pos;
+		System.out.println("+++ Positive set +++");
+		for (int i = 0; i < pos.size(); i++){
+			System.out.println("Das " + i + "te Element: " + pos.get(i).getLocalName());
+		}
+		
 		_negatives = neg;
+		System.out.println("+++ Negatvie set +++");
+		for (int i = 0; i < neg.size(); i++){
+			System.out.println("Das " + i + "te Element: " + neg.get(i).getLocalName());
+		}
 	}
 	
 	public void createFastaFile(String dir){
@@ -366,23 +377,24 @@
 			String fastaFilePath = dir + this.getProtein().getFastaFileName();
 			PrintStream out = new PrintStream (new File(fastaFilePath));
 			out.println(">" + this.getProtein().getPdbID() + "." + this.getProtein().getChainID() + "." + this.getProtein().getSpecies());
-			int seqLength = this.getProtein().getSequence().length();
+			String sequence = this.getProtein().getSequence();
+			int seqLength = sequence.length();
 			
 			if (seqLength > 80) {
 				// write sequence in 80 character blocks into file
 				int beginIndex = 0;
 				int endIndex = 80;
-				for (int i = 1;  endIndex <= seqLength; i++ ){
-					out.println(this.getProtein().getSequence().substring(beginIndex, endIndex));
-					if (seqLength - endIndex <= 80){
-						out.println(this.getProtein().getSequence().substring(endIndex, seqLength));
+				while ( endIndex <= seqLength ){
+					out.println(sequence.substring(beginIndex, endIndex));
+					if (seqLength - endIndex < 80){
+						out.println(sequence.substring(endIndex, seqLength));
 					}
 					beginIndex = endIndex;
-					endIndex += (i * 80);
+					endIndex += 80;
 				}
 				
 			} else {
-				out.println(this.getProtein().getSequence());
+				out.println(sequence);
 			}
 			out.close();
 		} catch (IOException e) {

Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBWekaLearner.java
===================================================================
--- trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBWekaLearner.java	2011-11-09 10:02:03 UTC (rev 3390)
+++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBWekaLearner.java	2011-11-10 08:15:41 UTC (rev 3391)
@@ -1,11 +1,6 @@
 package org.dllearner.examples.pdb;
 
 import java.io.File;
-import java.io.IOException;
-
-import org.apache.log4j.Logger;
-/*
-import java.io.File;
 import java.io.FileWriter;
 import java.io.IOException;
 import java.util.Random;
@@ -20,14 +15,14 @@
 import weka.classifiers.bayes.NaiveBayes;
 import weka.core.Instances;
 import weka.core.converters.ConverterUtils.DataSource;
-*/
 
+
 public class PDBWekaLearner {
 	
 	private static Logger logger = Logger.getRootLogger();
 	
 	public PDBWekaLearner (File arffFile) throws IOException{
-	/*	
+		
 		// create logger (configure this to your needs)
 		SimpleLayout layout = new SimpleLayout();
 		FileAppender fileAppender = new FileAppender(layout, "log/sample_log.txt", false);
@@ -75,6 +70,6 @@
 		} catch (Exception e){
 			e.printStackTrace();
 		}
-		*/
+		
 	}
 }

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.