From: <km...@us...> - 2011-01-11 23:04:53
|
Revision: 2598 http://dl-learner.svn.sourceforge.net/dl-learner/?rev=2598&view=rev Author: kmpf Date: 2011-01-11 23:04:45 +0000 (Tue, 11 Jan 2011) Log Message: ----------- Modified Paths: -------------- trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/TrainAndTestSet.java Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java 2011-01-11 14:38:13 UTC (rev 2597) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java 2011-01-11 23:04:45 UTC (rev 2598) @@ -1,11 +1,13 @@ package org.dllearner.examples.pdb; import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; import java.io.IOException; -import java.io.OutputStream; +import java.io.PrintStream; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Date; + import org.xml.sax.InputSource; import org.xml.sax.SAXException; @@ -13,9 +15,10 @@ import com.hp.hpl.jena.query.QueryExecution; import com.hp.hpl.jena.query.QueryExecutionFactory; import com.hp.hpl.jena.query.QueryFactory; -import com.hp.hpl.jena.query.ResultSet; -import com.hp.hpl.jena.query.ResultSetFormatter; -import com.hp.hpl.jena.rdf.model.Model; +import com.hp.hpl.jena.rdf.model.Property; +import com.hp.hpl.jena.rdf.model.ResIterator; +import com.hp.hpl.jena.rdf.model.Resource; +import com.hp.hpl.jena.rdf.model.ResourceFactory; import com.dumontierlab.pdb2rdf.model.PdbRdfModel; import com.dumontierlab.pdb2rdf.parser.PdbXmlParser; @@ -24,50 +27,57 @@ public class HelixRDFCreator { + + private static ArrayList<Resource> positives; + private static ArrayList<Resource> negatives; /** * @param args + * TODO: remove beginsAt, endsAt from model */ public static void main(String[] args) { - TrainAndTestSet sets = new TrainAndTestSet(2); - String[] trainIDs = sets.getTrainset(); + + + TrainAndTestSet sets = new TrainAndTestSet(1); PdbRdfModel trainmodel = new PdbRdfModel(); + trainmodel.add(getRdfModelForIds(sets.getTrainset())); + /* + * String[] id = {"200L"}; + * trainmodel.add(getRdfModelForIds(id)); + */ - for(int i = 0; i < trainIDs.length; i++){ - System.out.println(trainIDs[i]); - String[] ID = {trainIDs[i]}; - trainmodel.add(getRdfModelForIds(ID)); - } - - getRdfModelForIds(sets.getTrainset()); // PdbRdfModel testmodel = getRdfModelForIds(sets.getTestset()); + ResIterator niter = getFirstAA(trainmodel); + + /* take all amino acids which are in helices and put them into the + * positives ArrayList, and all others in the negatives ArrayList + */ + createPositivesAndNegatives(niter, trainmodel); + + /* + * writes the conf-File + */ + createConfFile(); + + try { - String queryString = - "PREFIX pdb: <http://bio2rdf.org/pdb> " + - "CONSTRUCT { ?x1 ?x2 ?x3 .} " + - "WHERE { ?x1 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://bio2rdf.org/pdb:Helix> . ?x1 ?x2 ?x3 .}"; - Query query = QueryFactory.create(queryString); + SimpleDateFormat df = new SimpleDateFormat("_yyyy_MM_dd"); + String filename = "Helixtrainer" + df.format(new Date()) + ".rdf"; + PrintStream out = new PrintStream (new File(filename)); + + // Output query results + trainmodel.write(out, "RDF/XML"); - // Execute the query and obtain results - QueryExecution qe = QueryExecutionFactory.create(query, trainmodel); - Model construct = qe.execConstruct(); - OutputStream out = new FileOutputStream (new File("qwertzu_iop.rdf")); - // Output query results - construct.write(out, "RDF/XML"); - - - - - // Important - free up resources used running the query + // Important - free up resources used running the query + out.close(); } - catch (FileNotFoundException e) + catch (IOException e) { - System.err.println("Datei nicht gefunden!"); + System.err.println("OutputStream konnte nicht geschlossen werden!"); } - } private static PdbRdfModel getRdfModelForIds(String[] pdbIDs) { @@ -81,11 +91,10 @@ while (i.hasNext()) { final InputSource input = i.next(); - PdbRdfModel model = new PdbRdfModel(); - model = parser.parse(input, new PdbRdfModel()); + PdbRdfModel model = parser.parse(input, new PdbRdfModel()); // jedes Model muss gleich nach den relevanten Daten durchsucht werden, // da ansonsten Probleme mit der Speichergröße auftreten können. - allmodels.add(getHelices(model)); + allmodels.add(getData(model)); } } catch (IOException e) { @@ -98,36 +107,156 @@ return allmodels; } - private static PdbRdfModel collectData (PdbRdfModel inmodel) { - PdbRdfModel collectmodel = new PdbRdfModel(); - collectmodel.add(getHelices(inmodel)); + private static PdbRdfModel getData(PdbRdfModel model) { + + // Beispiel einer SELECT Abfrage + /* String selectQuery = + * "SELECT { ?x1 ?x2 ?x3 .} " + + * "WHERE { ?x1 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://bio2rdf.org/pdb:Helix> .}"; + * Query query = QueryFactory.create(selectQuery); + * QueryExecution qe = QueryExecutionFactory.create(query, model); + * ResultSet select = qe.execSelect(); + * ResultSetFormatter.out (System.out, select, query); + * + */ - - return collectmodel; - } - - private static Model getHelices(PdbRdfModel model) { - // Zweimal dasselbe Ergebnis einmal als SELECT und einmal als CONSTRUCT (für weitere Bearbeitung) Statement - // SELECT Abfrage + // CONSTRUCT Abfrage + + PdbRdfModel construct = new PdbRdfModel(); String queryString = - " SELECT ?x1 ?x2 ?x3 WHERE { ?x1 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://bio2rdf.org/pdb:Helix> . ?x1 ?x2 ?x3 .}"; + /* i do it kind of difficult, but i want to be certain that i only get the sequences of + Polypeptides(L) which contain at least one Helix. Furthermore i collect the information + about at which position helices begin and end. + NOTE: this information has to be removed before oututing the model. But i will use this + to check for positive and negative train amino acids + */ + "PREFIX pdb: <http://bio2rdf.org/pdb:> " + + "CONSTRUCT { ?x1 <http://bio2rdf.org/pdb:beginsAt> ?x2 ." + + " ?x1 <http://bio2rdf.org/pdb:endsAt> ?x3 . " + + " ?x5 <http://purl.org/dc/terms/isPartOf> ?x4 . " + + " ?x5 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?x6 ." + + " ?x5 <http://bio2rdf.org/pdb:isImmediatelyBefore> ?x7 . } " + + "WHERE { ?x1 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://bio2rdf.org/pdb:Helix> ." + + " ?x1 <http://bio2rdf.org/pdb:beginsAt> ?x2 ." + + " ?x1 <http://bio2rdf.org/pdb:endsAt> ?x3 ." + + " ?x3 <http://purl.org/dc/terms/isPartOf> ?x4 ." + + " ?x4 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://bio2rdf.org/pdb:Polypeptide(L)> ." + + " ?x5 <http://purl.org/dc/terms/isPartOf> ?x4 ." + + " ?x5 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?x6 ." + + // with the Optional clause i get the information by which amino acid + // a amino acid is followed + " OPTIONAL { ?x5 <http://bio2rdf.org/pdb:isImmediatelyBefore> ?x7 . } .}"; + Query query = QueryFactory.create(queryString); QueryExecution qe = QueryExecutionFactory.create(query, model); - ResultSet select = qe.execSelect(); - ResultSetFormatter.out (System.out, select, query); - // CONSTRUCT Abfrage - queryString = - "PREFIX pdb: <http://bio2rdf.org/pdb> " + - "CONSTRUCT { ?x1 ?x2 ?x3 .} " + - "WHERE { ?x1 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://bio2rdf.org/pdb:Helix> . ?x1 ?x2 ?x3 .}"; - query = QueryFactory.create(queryString); - - // Execute the query and obtain results - qe = QueryExecutionFactory.create(query, model); - Model construct = qe.execConstruct(); + construct.add(qe.execConstruct()); qe.close(); return construct; } + + private static ResIterator getFirstAA( PdbRdfModel model) { + PdbRdfModel construct = new PdbRdfModel(); + /* i search for all amino acids (AA) that have a successor + * but do not have a predecessor -> it's the first AA of every + * polypeptide chain + */ + + String queryString = + "PREFIX pdb: <http://bio2rdf.org/pdb:> " + + "CONSTRUCT { ?x1 pdb:isImmediatelyBefore ?x2 . } " + + "WHERE { ?x1 pdb:isImmediatelyBefore ?x2 . " + + // NOT EXISTS can be used with SPARQL 1.1 + //"NOT EXISTS { ?x3 pdb:isImmediatelyBefore ?x1 . } }"; + " OPTIONAL { ?x3 pdb:isImmediatelyBefore ?x1 . } " + + " FILTER ( !BOUND(?x3) ) }"; + Query query = QueryFactory.create(queryString); + QueryExecution qe = QueryExecutionFactory.create(query, model); + construct.add(qe.execConstruct()); + qe.close(); + ResIterator niter = construct.listSubjects(); + return niter; + } + + private static void createPositivesAndNegatives(ResIterator riter, PdbRdfModel model) { + + // Properties i have to use to check for while going through the AA-chain + Property iib = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "isImmediatelyBefore"); + Property ba = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "beginsAt"); + Property ea = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "endsAt"); + ArrayList<Resource> pos = new ArrayList<Resource>(); + ArrayList<Resource> neg = new ArrayList<Resource>(); + + + // every first amino acid indicates a new AA-chain + while (riter.hasNext()) { + // Initialization of variables needed + Resource aaOne = riter.nextResource(); + Resource obj = aaOne; + Resource nobj = aaOne; + boolean inHelix = false; + + // look if there is a next AA + do { + // looks weird, but is needed to enter loop even for the last AA which does not have a iib-Property + obj = nobj; + // die Guten ins Töpfchen ... + // if we get an non-empty iterator for pdb:beginsAt the next AAs are within a AA-chain + if(model.listResourcesWithProperty(ba, obj).hasNext() && !inHelix ){ + inHelix = true; + System.out.println("Entering Helix!"); + } + // die Schlechten ins Kröpfchen + // if we get an non-empty iterator for pdb:endsAt and are already within a AA-chain + // the AAs AFTER the current ones aren't within a helix + if (model.listResourcesWithProperty(ea, obj).hasNext() && inHelix){ + inHelix = false; + System.out.println("Leaving Helix!"); + } + // get next AA if there is one + if (model.listObjectsOfProperty(obj, iib).hasNext()){ + nobj = model.getProperty(obj, iib).getResource(); + } + + // do something different if we are in a helix + if (inHelix){ + pos.add(obj); + System.out.println(obj.getURI() + " " + iib.getURI() + " " + nobj.getURI() + " we are in!"); + } else { + neg.add(obj); + System.out.println(obj.getURI() + " " + iib.getURI() + " " + nobj.getURI()); + } + + } while (obj.hasProperty(iib)) ; + } + positives = pos; + negatives = neg; + } + private static void createConfFile(){ + try + { + SimpleDateFormat df = new SimpleDateFormat("_yyyy_MM_dd"); + String filename = "pdb" + df.format(new Date()) + ".conf"; + PrintStream out = new PrintStream (new File(filename)); + + out.println("import(\"AA_properties.owl\");"); + out.println("import(\"" + filename + "\");"); + out.println(); + + for (int i = 0 ; i < positives.size() ; i++ ) { + out.println("+\"" + positives.get(i).getURI() + "\""); + } + + for (int i = 0 ; i < negatives.size() ; i++ ) { + out.println("-\"" + negatives.get(i).getURI() + "\""); + } + // Important - free up resources used running the query + out.close(); + } + catch (IOException e) + { + System.err.println("OutputStream konnte nicht geschlossen werden!"); + } + } } \ No newline at end of file Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/TrainAndTestSet.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/TrainAndTestSet.java 2011-01-11 14:38:13 UTC (rev 2597) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/TrainAndTestSet.java 2011-01-11 23:04:45 UTC (rev 2598) @@ -1,8 +1,5 @@ package org.dllearner.examples.pdb; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileReader; import java.io.IOException; import java.io.InputStreamReader; import java.io.LineNumberReader; @@ -81,14 +78,11 @@ this.pdbprotlines = arraylist; return count; - } catch (FileNotFoundException e) { + } + catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); return 0; - } catch (IOException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - return 0; } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <km...@us...> - 2011-03-31 06:34:24
|
Revision: 2741 http://dl-learner.svn.sourceforge.net/dl-learner/?rev=2741&view=rev Author: kmpf Date: 2011-03-31 06:34:17 +0000 (Thu, 31 Mar 2011) Log Message: ----------- Modified Paths: -------------- trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/TrainAndTestSet.java Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java 2011-03-29 16:09:59 UTC (rev 2740) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java 2011-03-31 06:34:17 UTC (rev 2741) @@ -1,6 +1,7 @@ package org.dllearner.examples.pdb; import java.io.File; +import java.io.FileNotFoundException; import java.io.IOException; import java.io.PrintStream; @@ -11,6 +12,15 @@ import java.util.Iterator; import java.util.Set; +import org.dllearner.cli.Start; +import org.dllearner.core.ComponentInitException; +import org.dllearner.core.ComponentManager; +import org.dllearner.core.KnowledgeSource; +import org.dllearner.core.ReasonerComponent; +import org.dllearner.core.owl.Description; +import org.dllearner.core.owl.Individual; +import org.dllearner.kb.OWLFile; +import org.dllearner.reasoning.FastInstanceChecker; import org.xml.sax.InputSource; import org.xml.sax.SAXException; @@ -61,6 +71,8 @@ private static String saveDir = "../test/pdb/"; + private static HashMap<Resource, File> confFilePerResidue; + private static File confFileForAll; /** * @param args @@ -70,8 +82,9 @@ TrainAndTestSet sets; PdbRdfModel trainmodel = new PdbRdfModel(); - do{ - sets = new TrainAndTestSet(1); + //do{ + String pdbID = "3LQH"; + sets = new TrainAndTestSet(pdbID); trainmodel.add(getRdfModelForIds(sets.getTrainset())); /* * String[] id = {"200L"}; @@ -82,12 +95,12 @@ /* * as we have to handle several amino acid chains we need the first - * amino acid of every chain, they are returned as ResIterator + * amino acid of every chain, they are returned within a ResIterator */ ResIterator niter = getFirstAA(trainmodel); /* - * we add some distance Information to out model + * we add some distance Information to our model */ trainmodel = addDistanceInfo(trainmodel); @@ -96,7 +109,7 @@ * global positives ArrayList, and all others in the global negatives ArrayList */ createPositivesAndNegatives(niter, trainmodel); - } while(positives.size() > 100 && negatives.size() > 100); + //} while(positives.size() > 100 && negatives.size() > 100); /* * remove all triples that contain information about begin and end of helices @@ -108,21 +121,22 @@ Resource residue = ResourceFactory.createResource("http://bio2rdf.org/pdb:Residue"); trainmodel = removeStatementsWithObject(trainmodel, residue); - /* - * writes the conf-Files - */ + SimpleDateFormat df = new SimpleDateFormat("_yyyy_MM_dd_HH:mm"); + String date = df.format(new Date()); + String rdffile= "Helixtrainer" + date + ".rdf"; + String filename = saveDir + rdffile; - try { - SimpleDateFormat df = new SimpleDateFormat("_yyyy_MM_dd_HH:mm"); - String date = df.format(new Date()); - String rdffile= "Helixtrainer" + date + ".rdf"; - String filename = saveDir + rdffile; + /* + * creatConfFile() + * writes the conf-Files and saves there File-objects in: + * confFileForAll and confFilePerResidue + */ createConfFile(date, rdffile, trainmodel); PrintStream out = new PrintStream (new File(filename)); - // Output query results + // Output results trainmodel.write(out, "RDF/XML"); // Important - free up resources used running the query @@ -132,6 +146,39 @@ { System.err.println("OutputStream konnte nicht geschlossen werden!"); } + + /* + * load RDF file and perform learn algorithm for every .conf-file + */ + + Start start = null; + Iterator<Resource> aa = confFilePerResidue.keySet().iterator(); + while ( aa.hasNext() ){ + Resource nextRes = aa.next(); + /* + ComponentManager cm = ComponentManager.getInstance(); + KnowledgeSource ks = cm.knowledgeSource(OWLFile.class); + cm.applyConfigEntry(ks, "url","file://" + filename ); + ReasonerComponent rc = cm.reasoner(FastInstanceChecker.class); + rc.init(); + */ + System.out.println(confFilePerResidue.get(nextRes).getAbsolutePath()); + try{ + start = new Start(confFilePerResidue.get(nextRes)); + } catch (ComponentInitException e) { + e.printStackTrace(); + } catch (FileNotFoundException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (org.dllearner.confparser.ParseException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + start.start(false); + Description d = start.getLearningAlgorithm().getCurrentlyBestDescription(); + System.out.println(d.toKBSyntaxString()); + } + } private static PdbRdfModel getRdfModelForIds(String[] pdbIDs) { @@ -239,7 +286,6 @@ " ?x2 pdb:isImmediatelyBefore ?x3 . " + " ?x3 pdb:isImmediatelyBefore ?x4 . " + " ?x4 pdb:isImmediatelyBefore ?x5 . }"; - System.out.println(queryString); Query query = QueryFactory.create(queryString); QueryExecution qe = QueryExecutionFactory.create(query, model); model.add(qe.execConstruct()); @@ -343,40 +389,67 @@ { // the file with all amino acids String pdbname = saveDir + "pdb" + date + ".conf"; - PrintStream out = new PrintStream (new File(pdbname)); + confFileForAll = new File(pdbname); + PrintStream out = new PrintStream (confFileForAll); // add import statements out.println("import(\"AA_properties.owl\");"); out.println("import(\"" + rdffile + "\");"); out.println(); + HashMap<Resource, File> resConfFiles = new HashMap<Resource, File>(30); + resConfFiles.put(ala, new File(saveDir + ala.getLocalName() + date + ".conf")); + resConfFiles.put(cys, new File(saveDir + cys.getLocalName() + date + ".conf")); + resConfFiles.put(asp, new File(saveDir + asp.getLocalName() + date + ".conf")); + resConfFiles.put(glu, new File(saveDir + glu.getLocalName() + date + ".conf")); + resConfFiles.put(phe, new File(saveDir + phe.getLocalName() + date + ".conf")); + resConfFiles.put(gly, new File(saveDir + gly.getLocalName() + date + ".conf")); + resConfFiles.put(his, new File(saveDir + his.getLocalName() + date + ".conf")); + resConfFiles.put(ile, new File(saveDir + ile.getLocalName() + date + ".conf")); + resConfFiles.put(lys, new File(saveDir + lys.getLocalName() + date + ".conf")); + resConfFiles.put(leu, new File(saveDir + leu.getLocalName() + date + ".conf")); + resConfFiles.put(met, new File(saveDir + met.getLocalName() + date + ".conf")); + resConfFiles.put(asn, new File(saveDir + asn.getLocalName() + date + ".conf")); + resConfFiles.put(pro, new File(saveDir + pro.getLocalName() + date + ".conf")); + resConfFiles.put(gln, new File(saveDir + gln.getLocalName() + date + ".conf")); + resConfFiles.put(arg, new File(saveDir + arg.getLocalName() + date + ".conf")); + resConfFiles.put(ser, new File(saveDir + ser.getLocalName() + date + ".conf")); + resConfFiles.put(thr, new File(saveDir + thr.getLocalName() + date + ".conf")); + resConfFiles.put(val, new File(saveDir + val.getLocalName() + date + ".conf")); + resConfFiles.put(trp, new File(saveDir + trp.getLocalName() + date + ".conf")); + resConfFiles.put(tyr, new File(saveDir + tyr.getLocalName() + date + ".conf")); + resConfFiles.put(sel, new File(saveDir + sel.getLocalName() + date + ".conf")); + confFilePerResidue = resConfFiles; + + + // put all amino acid resources and the their conf-files together HashMap<Resource, PrintStream> resprint = new HashMap<Resource, PrintStream>(30); - resprint.put(ala, new PrintStream(new File(saveDir + ala.getLocalName() + date + ".conf"))); - resprint.put(cys, new PrintStream(new File(saveDir + cys.getLocalName() + date + ".conf"))); - resprint.put(asp, new PrintStream(new File(saveDir + asp.getLocalName() + date + ".conf"))); - resprint.put(glu, new PrintStream(new File(saveDir + glu.getLocalName() + date + ".conf"))); - resprint.put(phe, new PrintStream(new File(saveDir + phe.getLocalName() + date + ".conf"))); - resprint.put(gly, new PrintStream(new File(saveDir + gly.getLocalName() + date + ".conf"))); - resprint.put(his, new PrintStream(new File(saveDir + his.getLocalName() + date + ".conf"))); - resprint.put(ile, new PrintStream(new File(saveDir + ile.getLocalName() + date + ".conf"))); - resprint.put(lys, new PrintStream(new File(saveDir + lys.getLocalName() + date + ".conf"))); - resprint.put(leu, new PrintStream(new File(saveDir + leu.getLocalName() + date + ".conf"))); - resprint.put(met, new PrintStream(new File(saveDir + met.getLocalName() + date + ".conf"))); - resprint.put(asn, new PrintStream(new File(saveDir + asn.getLocalName() + date + ".conf"))); - resprint.put(pro, new PrintStream(new File(saveDir + pro.getLocalName() + date + ".conf"))); - resprint.put(gln, new PrintStream(new File(saveDir + gln.getLocalName() + date + ".conf"))); - resprint.put(arg, new PrintStream(new File(saveDir + arg.getLocalName() + date + ".conf"))); - resprint.put(ser, new PrintStream(new File(saveDir + ser.getLocalName() + date + ".conf"))); - resprint.put(thr, new PrintStream(new File(saveDir + thr.getLocalName() + date + ".conf"))); - resprint.put(val, new PrintStream(new File(saveDir + val.getLocalName() + date + ".conf"))); - resprint.put(trp, new PrintStream(new File(saveDir + trp.getLocalName() + date + ".conf"))); - resprint.put(tyr, new PrintStream(new File(saveDir + tyr.getLocalName() + date + ".conf"))); - resprint.put(sel, new PrintStream(new File(saveDir + sel.getLocalName() + date + ".conf"))); + resprint.put(ala, new PrintStream(resConfFiles.get(ala))); + resprint.put(cys, new PrintStream(resConfFiles.get(cys))); + resprint.put(asp, new PrintStream(resConfFiles.get(asp))); + resprint.put(glu, new PrintStream(resConfFiles.get(glu))); + resprint.put(phe, new PrintStream(resConfFiles.get(phe))); + resprint.put(gly, new PrintStream(resConfFiles.get(gly))); + resprint.put(his, new PrintStream(resConfFiles.get(his))); + resprint.put(ile, new PrintStream(resConfFiles.get(ile))); + resprint.put(lys, new PrintStream(resConfFiles.get(lys))); + resprint.put(leu, new PrintStream(resConfFiles.get(leu))); + resprint.put(met, new PrintStream(resConfFiles.get(met))); + resprint.put(asn, new PrintStream(resConfFiles.get(asn))); + resprint.put(pro, new PrintStream(resConfFiles.get(pro))); + resprint.put(gln, new PrintStream(resConfFiles.get(gln))); + resprint.put(arg, new PrintStream(resConfFiles.get(arg))); + resprint.put(ser, new PrintStream(resConfFiles.get(ser))); + resprint.put(thr, new PrintStream(resConfFiles.get(thr))); + resprint.put(val, new PrintStream(resConfFiles.get(val))); + resprint.put(trp, new PrintStream(resConfFiles.get(trp))); + resprint.put(tyr, new PrintStream(resConfFiles.get(tyr))); + resprint.put(sel, new PrintStream(resConfFiles.get(sel))); Property type = ResourceFactory.createProperty("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "type"); // add import statements to .conf files for amino acids - Iterator<Resource> keys = resprint.keySet().iterator(); + Iterator<Resource> keys = resprint.keySet().iterator(); while (keys.hasNext()){ Resource k = keys.next(); Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/TrainAndTestSet.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/TrainAndTestSet.java 2011-03-29 16:09:59 UTC (rev 2740) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/TrainAndTestSet.java 2011-03-31 06:34:17 UTC (rev 2741) @@ -23,6 +23,15 @@ public String[] getTestset() { return testset; } + + public TrainAndTestSet (String pdbID) { + String[] pdbIDs = {pdbID}; + this.trainset = pdbIDs; + } + + public TrainAndTestSet (String[] pdbIDs) { + this.trainset = pdbIDs; + } public TrainAndTestSet (int setsize) { @@ -50,8 +59,8 @@ } // lets create Train- and Testset - this.trainset = this.create_set(setsize, linenr); - this.testset = this.create_set(setsize, linenr); + this.trainset = this.createSet(setsize, linenr); + this.testset = this.createSet(setsize, linenr); } catch (MalformedURLException e) { // TODO Auto-generated catch block @@ -108,7 +117,7 @@ */ //creates Sets of PDB IDs equal to setsize - private String [] create_set(int setsize, int linenr){ + private String [] createSet(int setsize, int linenr){ String [] set = new String [setsize]; if (this.setentries == null) { this.setentries = new HashMap<Integer,String>(2*setsize); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <km...@us...> - 2011-05-12 06:51:52
|
Revision: 2791 http://dl-learner.svn.sourceforge.net/dl-learner/?rev=2791&view=rev Author: kmpf Date: 2011-05-12 06:51:43 +0000 (Thu, 12 May 2011) Log Message: ----------- Modified Paths: -------------- trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/TrainAndTestSet.java Added Paths: ----------- trunk/scripts/src/main/java/org/dllearner/examples/pdb/PdbProtein.java Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java 2011-05-11 12:44:11 UTC (rev 2790) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java 2011-05-12 06:51:43 UTC (rev 2791) @@ -71,9 +71,23 @@ private static ArrayList<Resource> positives; private static ArrayList<Resource> negatives; - + public void setPositives(ArrayList<Resource> pos){ + positives = pos; + } - private static String saveDir = "../test/pdb/"; + public void setNegatives(ArrayList<Resource> neg){ + negatives = neg; + } + + public ArrayList<Resource> getPositives(){ + return positives; + } + + public ArrayList<Resource> getNegatives(){ + return negatives; + } + + private static String dataDir = "../test/pdb/"; private static HashMap<Resource, File> confFilePerResidue; private static File confFileForAll; @@ -82,79 +96,133 @@ * TODO: remove beginsAt, endsAt from model */ public static void main(String[] args) { + Boolean test = true; + int dataSet = 1; + /* + * get dataset files + */ + String bt426 = dataDir + "bt426.list"; + File bt426List = new File(bt426); + String plp273 = dataDir + "plp273.list"; + File plp273List = new File(plp273); + String plp364 = dataDir + "plp364.list"; + File plp364List = new File(plp364); + String plp399 = dataDir + "plp399.list"; + File plp399List = new File(plp399); + + /* + * data for test purpose + */ + String pdbID = "3LQH"; + String chainID = ""; - TrainAndTestSet sets; + /* + * generate trainset and fill trainmodel + */ PdbRdfModel trainmodel = new PdbRdfModel(); - //do{ - String pdbID = "3LQH"; - sets = new TrainAndTestSet(pdbID); - trainmodel.add(getRdfModelForIds(sets.getTrainset())); - /* - * String[] id = {"200L"}; - * trainmodel.add(getRdfModelForIds(id)); - */ + TrainAndTestSet trainSet = new TrainAndTestSet(); + + if (test) + { + trainSet = new TrainAndTestSet(pdbID, chainID); + } + else + { + switch (dataSet) { + case 1: trainSet = new TrainAndTestSet(bt426List); break; + case 2: trainSet = new TrainAndTestSet(plp273List); break; + case 3: trainSet = new TrainAndTestSet(plp364List); break; + case 4: trainSet = new TrainAndTestSet(plp399List); break; + } + } - // PdbRdfModel testmodel = getRdfModelForIds(sets.getTestset()); + + /* + * generate a PdbRdfModel for every pdbID + */ + + for (int i = 0; i < trainSet.getTrainset().length; i++) + { + String[] pdbIDs = {trainSet.getTrainset()[i].getPdbID()}; + trainmodel.removeAll(); + trainmodel.add(getRdfModelForIds(trainSet.getTrainset()[i].getPdbID(), trainSet.getTrainset()[i].getChainID())); + /* - * as we have to handle several amino acid chains we need the first + * as we have sometimes to handle several amino acid chains we need the first * amino acid of every chain, they are returned within a ResIterator */ ResIterator niter = getFirstAA(trainmodel); - ResIterator riter = getFirstAA(trainmodel); + ResIterator riter = niter; /* * we add some distance Information to our model */ trainmodel = addDistanceInfo(trainmodel); - + /* * take all amino acids which are in helices and put them into the * global positives ArrayList, and all others in the global negatives ArrayList */ createPositivesAndNegatives(niter, trainmodel); - //} while(positives.size() > 100 && negatives.size() > 100); - - SimpleDateFormat df = new SimpleDateFormat("_yyyy_MM_dd_HH_mm"); - String date = df.format(new Date()); - String rdffile = "Helixtrainer" + date + ".rdf"; - String arfffile = "Helixtrainer" + date + ".arff"; - String filename = saveDir + rdffile; - - createArffFile(date, arfffile, trainmodel, sets, riter); - /* - * remove all triples that contain information about begin and end of helices - */ - Property ba = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "beginsAt"); - trainmodel = removeStatementsWithPoperty(trainmodel, ba); - Property ea = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "endsAt"); - trainmodel = removeStatementsWithPoperty(trainmodel, ea); - Resource residue = ResourceFactory.createResource("http://bio2rdf.org/pdb:Residue"); - trainmodel = removeStatementsWithObject(trainmodel, residue); - - - try - { + SimpleDateFormat df = new SimpleDateFormat("_yyyy_MM_dd_HH_mm"); + String date = df.format(new Date()); + String rdfFile; + String arffFile; + if (trainSet.getTrainset()[i].getChainID().length() == 0) + { + rdfFile = trainSet.getTrainset()[i].getPdbID().toUpperCase() + date + ".rdf"; + arffFile = trainSet.getTrainset()[i].getPdbID().toUpperCase() + date + ".arff"; + } + else + { + rdfFile = trainSet.getTrainset()[i].getPdbID().toUpperCase() + "." + + trainSet.getTrainset()[i].getChainID().toUpperCase() + date + ".rdf"; + arffFile = trainSet.getTrainset()[i].getPdbID().toUpperCase() + "." + + trainSet.getTrainset()[i].getChainID().toUpperCase() + date + ".arff"; + } + String dir = dataDir + trainSet.getTrainset()[i].getPdbID() + "/"; + File directory = new File(dir); + directory.mkdir(); + String rdfFilePath = dir + rdfFile; + String arffFilePath = dir + arffFile; + + createArffFile(arffFilePath, trainmodel, trainSet, riter); /* - * creatConfFile() - * writes the conf-Files and saves there File-objects in: - * confFileForAll and confFilePerResidue + * remove all triples that contain information about begin and end of helices */ - createConfFile(date, rdffile, trainmodel); - PrintStream out = new PrintStream (new File(filename)); + Property ba = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "beginsAt"); + trainmodel = removeStatementsWithPoperty(trainmodel, ba); + Property ea = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "endsAt"); + trainmodel = removeStatementsWithPoperty(trainmodel, ea); + Resource residue = ResourceFactory.createResource("http://bio2rdf.org/pdb:Residue"); + trainmodel = removeStatementsWithObject(trainmodel, residue); - // Output results - trainmodel.write(out, "RDF/XML"); - // Important - free up resources used running the query - out.close(); - } - catch (IOException e) - { - System.err.println("OutputStream konnte nicht geschlossen werden!"); - } - + try + { + /* + * creatConfFile() + * writes the conf-Files and saves there File-objects in: + * confFileForAll and confFilePerResidue + */ + createConfFile(dir, date, rdfFile, trainmodel); + PrintStream out = new PrintStream (new File(rdfFilePath)); + + // Output results + trainmodel.write(out, "RDF/XML"); + + // Important - free up resources used running the query + out.close(); + } + catch (IOException e) + { + System.err.println("OutputStream konnte nicht geschlossen werden!"); + } + + } + /* * load RDF file and perform learn algorithm for every .conf-file */ @@ -191,34 +259,41 @@ */ } - private static PdbRdfModel getRdfModelForIds(String[] pdbIDs) { + private static PdbRdfModel getRdfModelForIds(String pdbID ,String chainID) { - // i is an Iterator over an XML InputSource + /* + * i is an Iterator over an XML InputSource + */ + String[] pdbIDs = {pdbID}; Pdb2RdfInputIterator i = new PdbsIterator(pdbIDs); PdbXmlParser parser = new PdbXmlParser(); PdbRdfModel allmodels = new PdbRdfModel(); - try { - - while (i.hasNext()) { + while (i.hasNext()) + { final InputSource input = i.next(); PdbRdfModel model = parser.parse(input, new PdbRdfModel()); - // jedes Model muss gleich nach den relevanten Daten durchsucht werden, - // da ansonsten Probleme mit der Speichergröße auftreten können. - allmodels.add(getData(model)); - - } - } catch (IOException e) { + /* + * jedes Model muss gleich nach den relevanten Daten durchsucht werden, + * da ansonsten Probleme mit der Speichergröße auftreten können. + */ + allmodels.add(getData(model, pdbID, chainID)); + } + } + catch (IOException e) + { // TODO Auto-generated catch block e.printStackTrace(); - } catch (SAXException e) { + } + catch (SAXException e) + { // TODO Auto-generated catch block e.printStackTrace(); } return allmodels; } - private static PdbRdfModel getData(PdbRdfModel model) { + private static PdbRdfModel getData(PdbRdfModel model, String pdbID, String chainID) { // Beispiel einer SELECT Abfrage /* String selectQuery = @@ -234,30 +309,70 @@ // CONSTRUCT Abfrage PdbRdfModel construct = new PdbRdfModel(); - /* i do it kind of difficult, but i want to be certain that i only get the sequences of + /* + * i do it kind of difficult, but i want to be certain that i only get the sequences of * Polypeptides(L) which contain at least one Helix. Furthermore i collect the information * about at which position helices begin and end. - * NOTE: this information has to be removed before oututing the model. But i will use this + * NOTE: this information has to be removed before outputing the model. But i will use this * to check for positive and negative train amino acids */ - String queryString = - "PREFIX pdb: <http://bio2rdf.org/pdb:> " + - "CONSTRUCT { ?x1 <http://bio2rdf.org/pdb:beginsAt> ?x2 ." + - " ?x1 <http://bio2rdf.org/pdb:endsAt> ?x3 . " + - " ?x5 <http://purl.org/dc/terms/isPartOf> ?x4 . " + - " ?x5 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?x6 ." + - " ?x5 <http://bio2rdf.org/pdb:isImmediatelyBefore> ?x7 . } " + - "WHERE { ?x1 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://bio2rdf.org/pdb:Helix> ." + - " ?x1 <http://bio2rdf.org/pdb:beginsAt> ?x2 ." + - " ?x1 <http://bio2rdf.org/pdb:endsAt> ?x3 ." + - " ?x3 <http://purl.org/dc/terms/isPartOf> ?x4 ." + - " ?x4 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://bio2rdf.org/pdb:Polypeptide(L)> ." + - " ?x5 <http://purl.org/dc/terms/isPartOf> ?x4 ." + - " ?x5 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?x6 ." + - // with the Optional clause i get the information by which amino acid - // a amino acid is followed - " OPTIONAL { ?x5 <http://bio2rdf.org/pdb:isImmediatelyBefore> ?x7 . } .}"; + /* + * ich brauche noch die selektion der chain und die info über den genursprungsorganismus + * rdf:resource="http://bio2rdf.org/pdb:3LQH/chain_A" + * http://bio2rdf.org/pdb:3LQH/chain_A/position_1596 + */ + + String queryString = ""; + + if (chainID.length() != 1 || pdbID.length() != 4) + { + queryString = + "PREFIX pdb: <http://bio2rdf.org/pdb:> " + + "PREFIX dcterms: <http://purl.org/dc/terms/> " + + "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> " + + "CONSTRUCT { ?x1 <http://bio2rdf.org/pdb:beginsAt> ?x2 ." + + " ?x1 <http://bio2rdf.org/pdb:endsAt> ?x3 . " + + " ?x5 <http://purl.org/dc/terms/isPartOf> ?x4 . " + + " ?x5 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?x6 ." + + " ?x5 <http://bio2rdf.org/pdb:isImmediatelyBefore> ?x7 . } " + + "WHERE { ?x1 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://bio2rdf.org/pdb:Helix> ." + + " ?x1 <http://bio2rdf.org/pdb:beginsAt> ?x2 ." + + " ?x1 <http://bio2rdf.org/pdb:endsAt> ?x3 ." + + " ?x3 <http://purl.org/dc/terms/isPartOf> ?x4 ." + + " ?x4 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://bio2rdf.org/pdb:Polypeptide(L)> ." + + " ?x5 <http://purl.org/dc/terms/isPartOf> ?x4 ." + + " ?x5 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?x6 ." + + // with the Optional clause i get the information by which amino acid + // a amino acid is followed + " OPTIONAL { ?x5 <http://bio2rdf.org/pdb:isImmediatelyBefore> ?x7 . } .}"; + } + else + { + queryString = + "PREFIX pdb: <http://bio2rdf.org/pdb:> " + + "PREFIX dcterms: <http://purl.org/dc/terms/> " + + "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> " + + "CONSTRUCT { ?x1 <http://bio2rdf.org/pdb:beginsAt> ?x2 ." + + " ?x1 <http://bio2rdf.org/pdb:endsAt> ?x3 . " + + " ?x5 <http://purl.org/dc/terms/isPartOf> ?x4 . " + + " ?x5 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?x6 ." + + " ?x5 <http://bio2rdf.org/pdb:isImmediatelyBefore> ?x7 . } " + + "WHERE { ?x1 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://bio2rdf.org/pdb:Helix> ." + + " ?x1 <http://bio2rdf.org/pdb:beginsAt> ?x2 ." + + " ?x1 <http://bio2rdf.org/pdb:endsAt> ?x3 ." + + " ?x3 <http://purl.org/dc/terms/isPartOf> ?x4 ." + + " ?x4 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://bio2rdf.org/pdb:Polypeptide(L)> ." + + " ?x5 <http://purl.org/dc/terms/isPartOf> ?x4 ." + + " ?x5 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?x6 ." + + " ?x5 <http://bio2rdf.org/pdb:hasChainPosition> ?x8 ." + + " ?x8 <http://purl.org/dc/terms/isPartOf> <http://bio2rdf.org/pdb:" + pdbID.toUpperCase() + + "/chain_" + chainID.toUpperCase() + "> ." + + // with the Optional clause i get the information by which amino acid + // a amino acid is followed + " OPTIONAL { ?x5 <http://bio2rdf.org/pdb:isImmediatelyBefore> ?x7 . } .}"; + } + System.out.println(queryString); Query query = QueryFactory.create(queryString); QueryExecution qe = QueryExecutionFactory.create(query, model); construct.add(qe.execConstruct()); @@ -394,11 +509,11 @@ } - private static void createConfFile(String date, String rdffile, PdbRdfModel model){ + private static void createConfFile(String dir, String date, String rdffile, PdbRdfModel model){ try { // the file with all amino acids - String pdbname = saveDir + "pdb" + date + ".conf"; + String pdbname = dir + "pdb" + date + ".conf"; confFileForAll = new File(pdbname); PrintStream out = new PrintStream (confFileForAll); // add import statements @@ -407,27 +522,27 @@ out.println(); HashMap<Resource, File> resConfFiles = new HashMap<Resource, File>(30); - resConfFiles.put(ala, new File(saveDir + ala.getLocalName() + date + ".conf")); - resConfFiles.put(cys, new File(saveDir + cys.getLocalName() + date + ".conf")); - resConfFiles.put(asp, new File(saveDir + asp.getLocalName() + date + ".conf")); - resConfFiles.put(glu, new File(saveDir + glu.getLocalName() + date + ".conf")); - resConfFiles.put(phe, new File(saveDir + phe.getLocalName() + date + ".conf")); - resConfFiles.put(gly, new File(saveDir + gly.getLocalName() + date + ".conf")); - resConfFiles.put(his, new File(saveDir + his.getLocalName() + date + ".conf")); - resConfFiles.put(ile, new File(saveDir + ile.getLocalName() + date + ".conf")); - resConfFiles.put(lys, new File(saveDir + lys.getLocalName() + date + ".conf")); - resConfFiles.put(leu, new File(saveDir + leu.getLocalName() + date + ".conf")); - resConfFiles.put(met, new File(saveDir + met.getLocalName() + date + ".conf")); - resConfFiles.put(asn, new File(saveDir + asn.getLocalName() + date + ".conf")); - resConfFiles.put(pro, new File(saveDir + pro.getLocalName() + date + ".conf")); - resConfFiles.put(gln, new File(saveDir + gln.getLocalName() + date + ".conf")); - resConfFiles.put(arg, new File(saveDir + arg.getLocalName() + date + ".conf")); - resConfFiles.put(ser, new File(saveDir + ser.getLocalName() + date + ".conf")); - resConfFiles.put(thr, new File(saveDir + thr.getLocalName() + date + ".conf")); - resConfFiles.put(val, new File(saveDir + val.getLocalName() + date + ".conf")); - resConfFiles.put(trp, new File(saveDir + trp.getLocalName() + date + ".conf")); - resConfFiles.put(tyr, new File(saveDir + tyr.getLocalName() + date + ".conf")); - resConfFiles.put(sel, new File(saveDir + sel.getLocalName() + date + ".conf")); + resConfFiles.put(ala, new File(dir + ala.getLocalName() + date + ".conf")); + resConfFiles.put(cys, new File(dir + cys.getLocalName() + date + ".conf")); + resConfFiles.put(asp, new File(dir + asp.getLocalName() + date + ".conf")); + resConfFiles.put(glu, new File(dir + glu.getLocalName() + date + ".conf")); + resConfFiles.put(phe, new File(dir + phe.getLocalName() + date + ".conf")); + resConfFiles.put(gly, new File(dir + gly.getLocalName() + date + ".conf")); + resConfFiles.put(his, new File(dir + his.getLocalName() + date + ".conf")); + resConfFiles.put(ile, new File(dir + ile.getLocalName() + date + ".conf")); + resConfFiles.put(lys, new File(dir + lys.getLocalName() + date + ".conf")); + resConfFiles.put(leu, new File(dir + leu.getLocalName() + date + ".conf")); + resConfFiles.put(met, new File(dir + met.getLocalName() + date + ".conf")); + resConfFiles.put(asn, new File(dir + asn.getLocalName() + date + ".conf")); + resConfFiles.put(pro, new File(dir + pro.getLocalName() + date + ".conf")); + resConfFiles.put(gln, new File(dir + gln.getLocalName() + date + ".conf")); + resConfFiles.put(arg, new File(dir + arg.getLocalName() + date + ".conf")); + resConfFiles.put(ser, new File(dir + ser.getLocalName() + date + ".conf")); + resConfFiles.put(thr, new File(dir + thr.getLocalName() + date + ".conf")); + resConfFiles.put(val, new File(dir + val.getLocalName() + date + ".conf")); + resConfFiles.put(trp, new File(dir + trp.getLocalName() + date + ".conf")); + resConfFiles.put(tyr, new File(dir + tyr.getLocalName() + date + ".conf")); + resConfFiles.put(sel, new File(dir + sel.getLocalName() + date + ".conf")); confFilePerResidue = resConfFiles; @@ -518,19 +633,16 @@ } } - private static void createArffFile(String date, String arfffile, PdbRdfModel model, TrainAndTestSet sets, ResIterator riter){ - String arffname = saveDir + "pdb" + date + ".arff"; + private static void createArffFile(String arffFilePath, PdbRdfModel model, TrainAndTestSet sets, ResIterator riter){ - - String relation = "@RELATION "; for (int i = 0; i < sets.getTrainset().length ; i++){ - System.out.println("Element " + i + "= " + sets.getTrainset()[i]); + System.out.println("Element " + i + " = " + sets.getTrainset()[i].getPdbID()); relation += sets.getTrainset()[i]; } /* - * ATTRIBUTEs + * ATTRIBUTES */ // Integer declaring Position in chain @@ -586,7 +698,8 @@ // every element in riter stands for a AA-chain start // every first amino acid indicates a new AA-chain - while (riter.hasNext()) { + while (riter.hasNext()) + { // Initialization of variables needed int i = 0; Resource aaOne = riter.nextResource(); @@ -604,28 +717,36 @@ // die Guten ins Töpfchen ... // if we get an non-empty iterator for pdb:beginsAt the next AAs are within a AA-helix - if(model.listResourcesWithProperty(ba, currentaa).hasNext() && !inHelix ){ + if(model.listResourcesWithProperty(ba, currentaa).hasNext() && !inHelix ) + { inHelix = true; } // die Schlechten ins Kröpfchen // if we get an non-empty iterator for pdb:endsAt and are already within a AA-helix // the AAs AFTER the current ones aren't within a helix - if (model.listResourcesWithProperty(ea, currentaa).hasNext() && inHelix){ + if (model.listResourcesWithProperty(ea, currentaa).hasNext() && inHelix) + { inHelix = false; } // get next AA if there is one - if (model.listObjectsOfProperty(currentaa, iib).hasNext()){ + if (model.listObjectsOfProperty(currentaa, iib).hasNext()) + { nextaa = model.getProperty(currentaa, iib).getResource(); } // add current amino acid to positives or negatives set - while(resType.hasNext()){ + while(resType.hasNext()) + { Resource aaType = resType.next().asResource(); System.out.println(aaType.getURI()); - if (resdata.get(aaType) != null){ - if (inHelix){ + if (resdata.get(aaType) != null) + { + if (inHelix) + { data += i + "," + 1 + "," + resdata.get(aaType); - } else { + } + else + { data += i + "," + 0 + "," + resdata.get(aaType); } } @@ -634,19 +755,18 @@ } while (currentaa.hasProperty(iib)) ; } - try{ - PrintStream out = new PrintStream (new File(arffname)); + try + { + PrintStream out = new PrintStream (new File(arffFilePath)); out.println(relation); out.print(attribute); out.print(data); out.close(); - - } catch (FileNotFoundException e ) { - System.err.println("Datei " + arffname + "konnte nicht angelegt werden!"); + } + catch (FileNotFoundException e ) + { + System.err.println("Datei " + arffFilePath + " konnte nicht angelegt werden!"); e.printStackTrace(); } - - - } -} \ No newline at end of file +} Added: trunk/scripts/src/main/java/org/dllearner/examples/pdb/PdbProtein.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/PdbProtein.java (rev 0) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/PdbProtein.java 2011-05-12 06:51:43 UTC (rev 2791) @@ -0,0 +1,43 @@ +package org.dllearner.examples.pdb; + +public class PdbProtein { + + private String pdbID; + private String chainID; + private String species; + + public PdbProtein(String pdbID) { + this.pdbID = pdbID; + } + + public PdbProtein(String pdbID, String chainID) { + this.pdbID = pdbID; + this.chainID = chainID; + } + public PdbProtein() { + this.pdbID = ""; + this.chainID = ""; + } + + public String getPdbID() { + return pdbID; + } + public void setPdbID(String pdbID) { + this.pdbID = pdbID; + } + public String getChainID() { + return chainID; + } + public void setChainID(String chain) { + this.chainID = chain; + } + public String getSpecies() { + return species; + } + public void setSpecies(String species) { + this.species = species; + } + + + +} Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/TrainAndTestSet.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/TrainAndTestSet.java 2011-05-11 12:44:11 UTC (rev 2790) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/TrainAndTestSet.java 2011-05-12 06:51:43 UTC (rev 2791) @@ -1,5 +1,7 @@ package org.dllearner.examples.pdb; +import java.io.File; +import java.io.FileReader; import java.io.IOException; import java.io.InputStreamReader; import java.io.LineNumberReader; @@ -11,29 +13,40 @@ public class TrainAndTestSet { - private String[] trainset; - private String[] testset; - private HashMap<Integer,String> setentries; - private ArrayList<String> pdbprotlines; + private PdbProtein[] trainset; - public String[] getTrainset() { + public PdbProtein[] getTrainset() { return trainset; } - - public String[] getTestset() { - return testset; + + + public TrainAndTestSet () { + String pdbID = ""; + PdbProtein[] pdbProteins = {new PdbProtein(pdbID)}; + this.trainset = pdbProteins; } public TrainAndTestSet (String pdbID) { - String[] pdbIDs = {pdbID}; - this.trainset = pdbIDs; + PdbProtein[] pdbProteins = {new PdbProtein(pdbID)}; + this.trainset = pdbProteins; } + public TrainAndTestSet (String pdbID, String chainID) { + PdbProtein[] pdbProteins = {new PdbProtein(pdbID, chainID)}; + this.trainset = pdbProteins; + } + public TrainAndTestSet (String[] pdbIDs) { - this.trainset = pdbIDs; + PdbProtein pdbProt; + PdbProtein[] pdbProteins = new PdbProtein[pdbIDs.length]; + for (int i = 0; i < pdbIDs.length; i++ ) + { + pdbProt = new PdbProtein(pdbIDs[i]); + pdbProteins[i] = pdbProt; + } + this.trainset = pdbProteins; } - public TrainAndTestSet (int setsize) { // we read in the online file with all PDB-entries @@ -41,14 +54,12 @@ try { pdbEntryType = new URL("ftp://ftp.wwpdb.org/pub/pdb/derived_data/pdb_entry_type.txt"); LineNumberReader pdbproteins = new LineNumberReader(new InputStreamReader(pdbEntryType.openStream())); - - // we calculate the number of lines in that file and - // read all lines into the global variable pdbprotlines - int linenr = this.getNumberOfLines(pdbproteins); + // read all lines in lines + ArrayList<String> lines = this.readInFile(pdbproteins); pdbproteins.close(); - - - // System.out.println("PDB Prot File has "+linenr+" lines." ); + // get number of lines + int linenr = lines.size(); + System.out.println("PDB Prot File has "+linenr+" lines." ); // handling of incorrect setsize values if ((2*setsize) >= linenr) { @@ -59,9 +70,7 @@ } // lets create Train- and Testset - this.trainset = this.createSet(setsize, linenr); - this.testset = this.createSet(setsize, linenr); - + this.trainset = this.createSet(setsize, linenr, lines); } catch (MalformedURLException e) { // TODO Auto-generated catch block e.printStackTrace(); @@ -71,77 +80,84 @@ } } - // this method counts the number of lines in the read in file and - // fills pdbprotlines with content - private int getNumberOfLines (LineNumberReader lnr) { - try { - int count = 0; - ArrayList<String> arraylist = new ArrayList<String>(); - String line; - - - while ((line = lnr.readLine()) != null) { - arraylist.add(count, line); - count++; + + public TrainAndTestSet (File pdbIDlist) { + try + { + LineNumberReader pdbproteins = new LineNumberReader(new FileReader(pdbIDlist)); + ArrayList<String> lines = this.readInFile(pdbproteins); + pdbproteins.close(); + // get number of lines + int linenr = lines.size(); + PdbProtein[] proteins = new PdbProtein[linenr]; + for (int i = 0; i < linenr; i++) + { + proteins[i].setPdbID(getpdbid(i, lines)); + proteins[i].setChainID(getChainID(i, lines)); } - this.pdbprotlines = arraylist; - - return count; + this.trainset = proteins; } - catch (IOException e) { + catch (IOException e) + { // TODO Auto-generated catch block e.printStackTrace(); - return 0; } + } -/* private void createArrayList(int linenumber){ + private ArrayList<String> readInFile (LineNumberReader lnr) { + ArrayList<String> arraylist = new ArrayList<String>(); try { - ArrayList<String> arraylist = new ArrayList<String>(); - LineNumberReader lnr = new LineNumberReader(new FileReader(this.pdbproteins)); - for (int i = 0; i < linenumber; i++) { - String line = lnr.readLine(); - arraylist.add(i, line); - // System.out.println("Line "+ i +": "+ line); + String line; + while ((line = lnr.readLine()) != null) + { + arraylist.add(line); } - this.pdbprotlines = arraylist; - } catch (FileNotFoundException e) { + } + catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); - } catch (IOException e) { - // TODO Auto-generated catch block - e.printStackTrace(); } - + return arraylist; } - */ //creates Sets of PDB IDs equal to setsize - private String [] createSet(int setsize, int linenr){ - String [] set = new String [setsize]; - if (this.setentries == null) { - this.setentries = new HashMap<Integer,String>(2*setsize); - } - HashMap<Integer,String> setmap = this.setentries; + private PdbProtein[] createSet(int setsize, int linenr, ArrayList<String> lines){ + + PdbProtein[] set = new PdbProtein[setsize]; + HashMap<Integer,String> setmap = new HashMap<Integer,String>(2*setsize); + Random gen = new Random(); for (int i = 0; i < setsize; i++) { int lnr = gen.nextInt(linenr); while (setmap.containsKey(Integer.valueOf(lnr))) { lnr = gen.nextInt(linenr); } - set[i] = this.getpdbid(lnr); - setmap.put(Integer.valueOf(lnr), set[i]); + set[i].setPdbID(this.getpdbid(lnr, lines)); + setmap.put(Integer.valueOf(lnr), set[i].getPdbID()); } - this.setentries = setmap; return set; } - private String getpdbid (int lineNumber) { + private String getpdbid (int lineNumber, ArrayList<String> lines ) { // Initialize a LineNumberReader - ArrayList<String> arraylist = pdbprotlines; - String line =(String) arraylist.get(lineNumber); + String line =(String) lines.get(lineNumber); String pdb_id = line.substring(0, 4); return pdb_id; } + + private String getChainID (int lineNumber, ArrayList<String> lines) { + String line =(String) lines.get(lineNumber); + String chainID; + if (line.length() > 4) + { + chainID = line.substring(5, 7); + } + else + { + chainID = ""; + } + return chainID; + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <km...@us...> - 2011-05-12 15:41:52
|
Revision: 2799 http://dl-learner.svn.sourceforge.net/dl-learner/?rev=2799&view=rev Author: kmpf Date: 2011-05-12 15:41:41 +0000 (Thu, 12 May 2011) Log Message: ----------- Modified Paths: -------------- trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/PdbProtein.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/TrainAndTestSet.java Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java 2011-05-12 14:17:45 UTC (rev 2798) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java 2011-05-12 15:41:41 UTC (rev 2799) @@ -1,8 +1,11 @@ package org.dllearner.examples.pdb; import java.io.File; +import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; +import java.io.InputStreamReader; +import java.io.LineNumberReader; import java.io.PrintStream; import java.text.SimpleDateFormat; @@ -71,10 +74,6 @@ private static ArrayList<Resource> positives; private static ArrayList<Resource> negatives; - public void setPositives(ArrayList<Resource> pos){ - positives = pos; - } - public void setNegatives(ArrayList<Resource> neg){ negatives = neg; } @@ -91,22 +90,43 @@ private static HashMap<Resource, File> confFilePerResidue; private static File confFileForAll; + public void setPositives(ArrayList<Resource> pos){ + positives = pos; + } + /** * @param args * TODO: remove beginsAt, endsAt from model */ public static void main(String[] args) { - Boolean test = true; + Boolean test = false; + Boolean rdfConf = true; + Boolean arff = false; + /* + * save: saves the whole trainset into the memory file + * load: loads the whole memory file into the trainset + */ + Boolean save = true; + Boolean load = false; + File memory = new File(dataDir + "memory.txt"); + Boolean dlLearn = false; + Boolean wekaLearn = false; + + int dataSet = 1; /* - * get dataset files + * get data set files */ + // data set 1 String bt426 = dataDir + "bt426.list"; File bt426List = new File(bt426); + // data set 2 String plp273 = dataDir + "plp273.list"; File plp273List = new File(plp273); + // data set 3 String plp364 = dataDir + "plp364.list"; File plp364List = new File(plp364); + // data set 4 String plp399 = dataDir + "plp399.list"; File plp399List = new File(plp399); @@ -114,7 +134,7 @@ * data for test purpose */ String pdbID = "3LQH"; - String chainID = ""; + String chainID = "A"; /* * generate trainset and fill trainmodel @@ -122,21 +142,29 @@ PdbRdfModel trainmodel = new PdbRdfModel(); TrainAndTestSet trainSet = new TrainAndTestSet(); - if (test) + if (test && !load ) { trainSet = new TrainAndTestSet(pdbID, chainID); } - else + + if ( !test && !load ) { switch (dataSet) { case 1: trainSet = new TrainAndTestSet(bt426List); break; case 2: trainSet = new TrainAndTestSet(plp273List); break; case 3: trainSet = new TrainAndTestSet(plp364List); break; case 4: trainSet = new TrainAndTestSet(plp399List); break; - } + } } - + if(load && memory.canRead()) + { + System.out.println("Hier!"); + trainSet = new TrainAndTestSet(memory); + } + + + /* * generate a PdbRdfModel for every pdbID */ @@ -145,6 +173,8 @@ for (int i = 0; i < trainSet.getTrainset().length; i++) { String[] pdbIDs = {trainSet.getTrainset()[i].getPdbID()}; + System.out.println("pdbId: " + trainSet.getTrainset()[i].getPdbID()); + System.out.println("chainID: " + trainSet.getTrainset()[i].getChainID()); trainmodel.removeAll(); trainmodel.add(getRdfModelForIds(trainSet.getTrainset()[i].getPdbID(), trainSet.getTrainset()[i].getChainID())); @@ -153,7 +183,7 @@ * amino acid of every chain, they are returned within a ResIterator */ ResIterator niter = getFirstAA(trainmodel); - ResIterator riter = niter; + /* * we add some distance Information to our model @@ -165,22 +195,19 @@ * global positives ArrayList, and all others in the global negatives ArrayList */ createPositivesAndNegatives(niter, trainmodel); - - SimpleDateFormat df = new SimpleDateFormat("_yyyy_MM_dd_HH_mm"); - String date = df.format(new Date()); String rdfFile; String arffFile; if (trainSet.getTrainset()[i].getChainID().length() == 0) { - rdfFile = trainSet.getTrainset()[i].getPdbID().toUpperCase() + date + ".rdf"; - arffFile = trainSet.getTrainset()[i].getPdbID().toUpperCase() + date + ".arff"; + rdfFile = trainSet.getTrainset()[i].getPdbID().toUpperCase() + ".rdf"; + arffFile = trainSet.getTrainset()[i].getPdbID().toUpperCase() + ".arff"; } else { rdfFile = trainSet.getTrainset()[i].getPdbID().toUpperCase() + "." - + trainSet.getTrainset()[i].getChainID().toUpperCase() + date + ".rdf"; + + trainSet.getTrainset()[i].getChainID().toUpperCase() + ".rdf"; arffFile = trainSet.getTrainset()[i].getPdbID().toUpperCase() + "." - + trainSet.getTrainset()[i].getChainID().toUpperCase() + date + ".arff"; + + trainSet.getTrainset()[i].getChainID().toUpperCase() + ".arff"; } String dir = dataDir + trainSet.getTrainset()[i].getPdbID() + "/"; File directory = new File(dir); @@ -188,8 +215,16 @@ String rdfFilePath = dir + rdfFile; String arffFilePath = dir + arffFile; - createArffFile(arffFilePath, trainmodel, trainSet, riter); /* + * if arff = true create pdbID.arff files + */ + if (arff) + { + niter = getFirstAA(trainmodel); + createArffFile(arffFilePath, trainmodel, trainSet, niter); + } + + /* * remove all triples that contain information about begin and end of helices */ Property ba = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "beginsAt"); @@ -199,64 +234,104 @@ Resource residue = ResourceFactory.createResource("http://bio2rdf.org/pdb:Residue"); trainmodel = removeStatementsWithObject(trainmodel, residue); + + /* + * if rdfConf = true create pdbID.rdf and *.conf files + */ + if(rdfConf) + { + try + { + /* + * creatConfFile() + * writes the conf-Files and saves there File-objects in: + * confFileForAll and confFilePerResidue + */ + createConfFile(dir, rdfFile, trainmodel); + PrintStream out = new PrintStream (new File(rdfFilePath)); + + // Output results + trainmodel.write(out, "RDF/XML"); - try - { + // Important - free up resources used running the query + out.close(); + } + catch (IOException e) + { + System.err.println("OutputStream konnte nicht geschlossen werden!"); + } + } + + if(dlLearn) + { + File filename = confFileForAll; + /* - * creatConfFile() - * writes the conf-Files and saves there File-objects in: - * confFileForAll and confFilePerResidue - */ - createConfFile(dir, date, rdfFile, trainmodel); - PrintStream out = new PrintStream (new File(rdfFilePath)); + * load RDF file and perform learn algorithm for every .conf-file + */ + ComponentManager cm = ComponentManager.getInstance(); + KnowledgeSource ks = cm.knowledgeSource(OWLFile.class); + cm.applyConfigEntry(ks, "url","file://" + filename ); + ReasonerComponent rc = cm.reasoner(FastInstanceChecker.class); + try { + rc.init(); + } catch (ComponentInitException e1) { + e1.printStackTrace(); + } + Start start = null; + Iterator<Resource> aa = confFilePerResidue.keySet().iterator(); + while ( aa.hasNext() ){ + Resource nextRes = aa.next(); + System.out.println(confFilePerResidue.get(nextRes).getAbsolutePath()); + try{ + start = new Start(confFilePerResidue.get(nextRes)); + } catch (ComponentInitException e) { + e.printStackTrace(); + } catch (FileNotFoundException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (org.dllearner.confparser.ParseException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + start.start(false); + Description d = start.getLearningAlgorithm().getCurrentlyBestDescription(); + System.out.println(d.toKBSyntaxString()); + } + } + + if(wekaLearn) + { - // Output results - trainmodel.write(out, "RDF/XML"); - - // Important - free up resources used running the query - out.close(); - } - catch (IOException e) - { - System.err.println("OutputStream konnte nicht geschlossen werden!"); - } - + } } - /* - * load RDF file and perform learn algorithm for every .conf-file - */ - - - /* - ComponentManager cm = ComponentManager.getInstance(); - KnowledgeSource ks = cm.knowledgeSource(OWLFile.class); - cm.applyConfigEntry(ks, "url","file://" + filename ); - ReasonerComponent rc = cm.reasoner(FastInstanceChecker.class); - rc.init(); - */ - /* - Start start = null; - Iterator<Resource> aa = confFilePerResidue.keySet().iterator(); - while ( aa.hasNext() ){ - Resource nextRes = aa.next(); - System.out.println(confFilePerResidue.get(nextRes).getAbsolutePath()); - try{ - start = new Start(confFilePerResidue.get(nextRes)); - } catch (ComponentInitException e) { + if(save) + { + String infos = ""; + for(int i=0; i < trainSet.getTrainset().length; i++) + { + infos = trainSet.getTrainset()[i].getPdbID() + "." + + trainSet.getTrainset()[i].getChainID() + "." + + trainSet.getTrainset()[i].getSpecies(); + if(i != trainSet.getTrainset().length - 1) + { + infos += "\n"; + } + } + + try + { + PrintStream out = new PrintStream (memory); + out.println(infos); + out.close(); + } + catch(FileNotFoundException e) + { e.printStackTrace(); - } catch (FileNotFoundException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } catch (org.dllearner.confparser.ParseException e) { - // TODO Auto-generated catch block - e.printStackTrace(); } - start.start(false); - Description d = start.getLearningAlgorithm().getCurrentlyBestDescription(); - System.out.println(d.toKBSyntaxString()); + } - */ } private static PdbRdfModel getRdfModelForIds(String pdbID ,String chainID) { @@ -509,11 +584,11 @@ } - private static void createConfFile(String dir, String date, String rdffile, PdbRdfModel model){ + private static void createConfFile(String dir, String rdffile, PdbRdfModel model){ try { // the file with all amino acids - String pdbname = dir + "pdb" + date + ".conf"; + String pdbname = dir + "pdb" + ".conf"; confFileForAll = new File(pdbname); PrintStream out = new PrintStream (confFileForAll); // add import statements @@ -522,27 +597,27 @@ out.println(); HashMap<Resource, File> resConfFiles = new HashMap<Resource, File>(30); - resConfFiles.put(ala, new File(dir + ala.getLocalName() + date + ".conf")); - resConfFiles.put(cys, new File(dir + cys.getLocalName() + date + ".conf")); - resConfFiles.put(asp, new File(dir + asp.getLocalName() + date + ".conf")); - resConfFiles.put(glu, new File(dir + glu.getLocalName() + date + ".conf")); - resConfFiles.put(phe, new File(dir + phe.getLocalName() + date + ".conf")); - resConfFiles.put(gly, new File(dir + gly.getLocalName() + date + ".conf")); - resConfFiles.put(his, new File(dir + his.getLocalName() + date + ".conf")); - resConfFiles.put(ile, new File(dir + ile.getLocalName() + date + ".conf")); - resConfFiles.put(lys, new File(dir + lys.getLocalName() + date + ".conf")); - resConfFiles.put(leu, new File(dir + leu.getLocalName() + date + ".conf")); - resConfFiles.put(met, new File(dir + met.getLocalName() + date + ".conf")); - resConfFiles.put(asn, new File(dir + asn.getLocalName() + date + ".conf")); - resConfFiles.put(pro, new File(dir + pro.getLocalName() + date + ".conf")); - resConfFiles.put(gln, new File(dir + gln.getLocalName() + date + ".conf")); - resConfFiles.put(arg, new File(dir + arg.getLocalName() + date + ".conf")); - resConfFiles.put(ser, new File(dir + ser.getLocalName() + date + ".conf")); - resConfFiles.put(thr, new File(dir + thr.getLocalName() + date + ".conf")); - resConfFiles.put(val, new File(dir + val.getLocalName() + date + ".conf")); - resConfFiles.put(trp, new File(dir + trp.getLocalName() + date + ".conf")); - resConfFiles.put(tyr, new File(dir + tyr.getLocalName() + date + ".conf")); - resConfFiles.put(sel, new File(dir + sel.getLocalName() + date + ".conf")); + resConfFiles.put(ala, new File(dir + ala.getLocalName() + ".conf")); + resConfFiles.put(cys, new File(dir + cys.getLocalName() + ".conf")); + resConfFiles.put(asp, new File(dir + asp.getLocalName() + ".conf")); + resConfFiles.put(glu, new File(dir + glu.getLocalName() + ".conf")); + resConfFiles.put(phe, new File(dir + phe.getLocalName() + ".conf")); + resConfFiles.put(gly, new File(dir + gly.getLocalName() + ".conf")); + resConfFiles.put(his, new File(dir + his.getLocalName() + ".conf")); + resConfFiles.put(ile, new File(dir + ile.getLocalName() + ".conf")); + resConfFiles.put(lys, new File(dir + lys.getLocalName() + ".conf")); + resConfFiles.put(leu, new File(dir + leu.getLocalName() + ".conf")); + resConfFiles.put(met, new File(dir + met.getLocalName() + ".conf")); + resConfFiles.put(asn, new File(dir + asn.getLocalName() + ".conf")); + resConfFiles.put(pro, new File(dir + pro.getLocalName() + ".conf")); + resConfFiles.put(gln, new File(dir + gln.getLocalName() + ".conf")); + resConfFiles.put(arg, new File(dir + arg.getLocalName() + ".conf")); + resConfFiles.put(ser, new File(dir + ser.getLocalName() + ".conf")); + resConfFiles.put(thr, new File(dir + thr.getLocalName() + ".conf")); + resConfFiles.put(val, new File(dir + val.getLocalName() + ".conf")); + resConfFiles.put(trp, new File(dir + trp.getLocalName() + ".conf")); + resConfFiles.put(tyr, new File(dir + tyr.getLocalName() + ".conf")); + resConfFiles.put(sel, new File(dir + sel.getLocalName() + ".conf")); confFilePerResidue = resConfFiles; Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/PdbProtein.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/PdbProtein.java 2011-05-12 14:17:45 UTC (rev 2798) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/PdbProtein.java 2011-05-12 15:41:41 UTC (rev 2799) @@ -6,19 +6,25 @@ private String chainID; private String species; + public PdbProtein() { + this("", "", ""); + } + public PdbProtein(String pdbID) { - this.pdbID = pdbID; + this(pdbID, "", ""); } public PdbProtein(String pdbID, String chainID) { + this(pdbID, chainID, ""); + } + + public PdbProtein(String pdbID, String chainID, String species) { this.pdbID = pdbID; this.chainID = chainID; + this.species = species; } - public PdbProtein() { - this.pdbID = ""; - this.chainID = ""; - } + public String getPdbID() { return pdbID; } @@ -37,7 +43,4 @@ public void setSpecies(String species) { this.species = species; } - - - } Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/TrainAndTestSet.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/TrainAndTestSet.java 2011-05-12 14:17:45 UTC (rev 2798) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/TrainAndTestSet.java 2011-05-12 15:41:41 UTC (rev 2799) @@ -22,7 +22,9 @@ public TrainAndTestSet () { String pdbID = ""; - PdbProtein[] pdbProteins = {new PdbProtein(pdbID)}; + String chainID = ""; + String species = ""; + PdbProtein[] pdbProteins = {new PdbProtein(pdbID, chainID, species)}; this.trainset = pdbProteins; } @@ -36,6 +38,11 @@ this.trainset = pdbProteins; } + public TrainAndTestSet (String pdbID, String chainID, String species) { + PdbProtein[] pdbProteins = {new PdbProtein(pdbID, chainID, species)}; + this.trainset = pdbProteins; + } + public TrainAndTestSet (String[] pdbIDs) { PdbProtein pdbProt; PdbProtein[] pdbProteins = new PdbProtein[pdbIDs.length]; @@ -85,20 +92,22 @@ try { LineNumberReader pdbproteins = new LineNumberReader(new FileReader(pdbIDlist)); - ArrayList<String> lines = this.readInFile(pdbproteins); + ArrayList<String> lines = this.readInFile(pdbproteins); pdbproteins.close(); // get number of lines int linenr = lines.size(); + System.out.println("File "+ pdbIDlist.getCanonicalPath() + " has " + linenr + " lines."); PdbProtein[] proteins = new PdbProtein[linenr]; for (int i = 0; i < linenr; i++) { - proteins[i].setPdbID(getpdbid(i, lines)); - proteins[i].setChainID(getChainID(i, lines)); + System.out.println("LINES element " + i + " contains " + lines.get(i)); + proteins[i] = new PdbProtein(getPdbID(i, lines), getChainID(i, lines), getSpecies(i, lines)); } this.trainset = proteins; } catch (IOException e) { + System.err.println("File " + pdbIDlist.getAbsolutePath() + " could not be read in!"); // TODO Auto-generated catch block e.printStackTrace(); } @@ -133,26 +142,34 @@ while (setmap.containsKey(Integer.valueOf(lnr))) { lnr = gen.nextInt(linenr); } - set[i].setPdbID(this.getpdbid(lnr, lines)); + set[i].setPdbID(this.getPdbID(lnr, lines)); setmap.put(Integer.valueOf(lnr), set[i].getPdbID()); } return set; } - private String getpdbid (int lineNumber, ArrayList<String> lines ) { + private String getPdbID (int lineNumber, ArrayList<String> lines ) { // Initialize a LineNumberReader String line =(String) lines.get(lineNumber); - String pdb_id = line.substring(0, 4); - return pdb_id; + String pdbID; + if ( line.length() >= 4 ) + { + pdbID = line.substring(0, line.indexOf(".")); + } + else + { + pdbID = ""; + } + return pdbID; } private String getChainID (int lineNumber, ArrayList<String> lines) { String line =(String) lines.get(lineNumber); String chainID; - if (line.length() > 4) + if (line.contains(".") ) { - chainID = line.substring(5, 7); + chainID = line.substring(line.indexOf(".") + 1, line.lastIndexOf(".")); } else { @@ -160,4 +177,18 @@ } return chainID; } + + private String getSpecies (int lineNumber, ArrayList<String> lines) { + String line =(String) lines.get(lineNumber); + String species; + if (line.length() > 6) + { + species = line.substring(line.lastIndexOf(".")); + } + else + { + species = ""; + } + return species; + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <km...@us...> - 2011-06-23 08:48:25
|
Revision: 2927 http://dl-learner.svn.sourceforge.net/dl-learner/?rev=2927&view=rev Author: kmpf Date: 2011-06-23 08:48:18 +0000 (Thu, 23 Jun 2011) Log Message: ----------- Modified Paths: -------------- trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/TrainAndTestSet.java Added Paths: ----------- trunk/scripts/src/main/java/org/dllearner/examples/pdb/ArffFileFilter.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/DirectoryFileFilter.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/RdfFileFilter.java Added: trunk/scripts/src/main/java/org/dllearner/examples/pdb/ArffFileFilter.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/ArffFileFilter.java (rev 0) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/ArffFileFilter.java 2011-06-23 08:48:18 UTC (rev 2927) @@ -0,0 +1,14 @@ +package org.dllearner.examples.pdb; + +import java.io.File; +import java.io.FileFilter; + +public class ArffFileFilter implements FileFilter +{ + private final String extension = new String("arff"); + + public boolean accept(File file) + { + return file.getName().toLowerCase().endsWith(extension); + } +} \ No newline at end of file Added: trunk/scripts/src/main/java/org/dllearner/examples/pdb/DirectoryFileFilter.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/DirectoryFileFilter.java (rev 0) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/DirectoryFileFilter.java 2011-06-23 08:48:18 UTC (rev 2927) @@ -0,0 +1,17 @@ +package org.dllearner.examples.pdb; + +import java.io.File; +import java.io.FileFilter; + +public class DirectoryFileFilter implements FileFilter +{ + public boolean accept(File file) + { + if (file.isDirectory() && file.getName().length() == 4 && !file.getName().startsWith(".")) { + return true; + } + else { + return false; + } + } +} Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java 2011-06-23 08:17:44 UTC (rev 2926) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java 2011-06-23 08:48:18 UTC (rev 2927) @@ -3,6 +3,7 @@ import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; +import java.io.FileWriter; import java.io.IOException; import java.io.InputStreamReader; import java.io.LineNumberReader; @@ -75,6 +76,7 @@ private static Resource trp = ResourceFactory.createResource("http://bio2rdf.org/pdb:Tryptophan"); private static Resource tyr = ResourceFactory.createResource("http://bio2rdf.org/pdb:Tyrosine"); private static Resource sel = ResourceFactory.createResource("http://bio2rdf.org/pdb:Selenomethionine"); + private static Resource hyt = ResourceFactory.createResource("http://bio2rdf.org/pdb:2-hydroxy-tryptophan"); private static ArrayList<Resource> positives; private static ArrayList<Resource> negatives; @@ -104,16 +106,26 @@ * TODO: remove beginsAt, endsAt from model */ public static void main(String[] args) { - Boolean test = false; + /* + * test = true -> use test data + * test = false -> use data set 1, 2, 3 or 4 + */ + Boolean test = true; + /* + * rdfConf = true -> write out the .rdf and .conf-Files + * rdfConf = false -> does not generate those files + */ Boolean rdfConf = true; + /* + * arff = true -> write out .arff-Files + * arff = false -> does not generate those files + */ Boolean arff = false; /* - * save: saves the whole trainset into the memory file - * load: loads the whole memory file into the trainset + * load = true -> load alle .rdf, .conf and .arff Files that can be found within the directory dataDir + * load = false -> don't load anything */ - Boolean save = true; - Boolean load = false; - String savePdbInfos = ".info"; + Boolean load = true; Boolean dlLearn = false; Boolean wekaLearn = false; @@ -137,57 +149,65 @@ /* * data for test purpose */ - String pdbID = "3LQH"; + String pdbID = "1XFF"; String chainID = "A"; + File dir = new File(dataDir); /* * generate trainset and fill trainmodel */ PdbRdfModel trainmodel = new PdbRdfModel(); TrainAndTestSet trainSet = new TrainAndTestSet(); - String pdbIdInfo = ""; - String bt426Info = ""; - String plp273Info = ""; - String plp364Info = ""; - String plp399Info = ""; - - if (test && !load ) - { - trainSet = new TrainAndTestSet(pdbID, chainID); - pdbIdInfo = dataDir + pdbID + savePdbInfos; - } - if ( !test && !load ) + if ( !test ) { switch (dataSet) { case 1: trainSet = new TrainAndTestSet(bt426List); - bt426Info = dataDir + bt426 + savePdbInfos; break; case 2: trainSet = new TrainAndTestSet(plp273List); - plp273Info = dataDir + plp273 + savePdbInfos; break; case 3: trainSet = new TrainAndTestSet(plp364List); - plp364Info = dataDir + plp364 + savePdbInfos; break; case 4: trainSet = new TrainAndTestSet(plp399List); - plp399Info = dataDir + plp399 + savePdbInfos; break; } } - File memory = new File(savePdbInfos); - File bt426InfoFile = new File(bt426Info); - File plp273InfoFile = new File(plp273Info); - File plp364InfoFile = new File(plp364Info); - File plp399InfoFile = new File(plp399Info); + else + { + trainSet = new TrainAndTestSet(pdbID, chainID); + } + + HashMap<String,File> rdfFiles = new HashMap<String,File>(); + HashMap<String,File> arffFiles = new HashMap<String,File>(); - if(load && memory.canRead()) + if (load) { - System.out.println("Hier!"); - trainSet = new TrainAndTestSet(memory); + System.out.println("Starting to load files in " + dataDir ); + File[] pdbDir = dir.listFiles(new DirectoryFileFilter()); + for (File actDir : pdbDir) { + File[] rdfFilesInActDir = actDir.listFiles(new RdfFileFilter()); + try { + System.out.println("Looking for Files in " + actDir.getCanonicalPath() ); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + for (File rdfFile : rdfFilesInActDir) { + rdfFiles.put(rdfFile.getName().substring(0, 3), rdfFile); + System.out.println("Found RDF File for PDB ID " + rdfFile.getName().substring(0, 3) ); + System.out.println("Found RDF File " + rdfFile.getName() ); + } + File[] arffFilesInActDir = actDir.listFiles(new ArffFileFilter()); + for (File arffFile : arffFilesInActDir) { + arffFiles.put(arffFile.getName().substring(0, 3), arffFile); + System.out.println("Found RDF File for PDB ID " + arffFile.getName().substring(0, 3) ); + System.out.println("Found ARFF File " + arffFile.getName() ); + } + } } @@ -197,17 +217,19 @@ */ + + for (int i = 0; i < trainSet.getTrainset().length; i++) { - String[] pdbIDs = {trainSet.getTrainset()[i].getPdbID()}; System.out.println("pdbId: " + trainSet.getTrainset()[i].getPdbID()); System.out.println("chainID: " + trainSet.getTrainset()[i].getChainID()); trainmodel.removeAll(); trainmodel.add(getRdfModelForIds(trainSet.getTrainset()[i].getPdbID(), trainSet.getTrainset()[i].getChainID())); - + /* + * extract the species the protein originates from + */ trainSet.getTrainset()[i].setSpecies(getSpecies(trainmodel, trainSet.getTrainset()[i].getPdbID())); - /* * as we have sometimes to handle several amino acid chains we need the first @@ -215,9 +237,8 @@ */ ResIterator niter = getFirstAA(trainmodel); - /* - * we add some distance Information to our model + * we add the information of which amino acid is the fourth predecessor of which other amino acid */ trainmodel = addDistanceInfo(trainmodel); @@ -240,11 +261,11 @@ arffFile = trainSet.getTrainset()[i].getPdbID().toUpperCase() + "." + trainSet.getTrainset()[i].getChainID().toUpperCase() + ".arff"; } - String dir = dataDir + trainSet.getTrainset()[i].getPdbID() + "/"; - File directory = new File(dir); + String pdbDir = dataDir + trainSet.getTrainset()[i].getPdbID() + "/"; + File directory = new File(pdbDir); directory.mkdir(); - String rdfFilePath = dir + rdfFile; - String arffFilePath = dir + arffFile; + String rdfFilePath = pdbDir + rdfFile; + String arffFilePath = pdbDir + arffFile; /* * if arff = true create pdbID.arff files @@ -256,18 +277,6 @@ } /* - * remove all triples that contain information about begin and end of helices - */ - - Property ba = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "beginsAt"); - trainmodel = removeStatementsWithPoperty(trainmodel, ba); - Property ea = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "endsAt"); - trainmodel = removeStatementsWithPoperty(trainmodel, ea); - Resource residue = ResourceFactory.createResource("http://bio2rdf.org/pdb:Residue"); - trainmodel = removeStatementsWithObject(trainmodel, residue); - - - /* * if rdfConf = true create pdbID.rdf and *.conf files */ if(rdfConf) @@ -279,7 +288,7 @@ * writes the conf-Files and saves there File-objects in: * confFileForAll and confFilePerResidue */ - createConfFile(dir, rdfFile, trainmodel); + createConfFile(pdbDir, rdfFile, trainmodel); PrintStream out = new PrintStream (new File(rdfFilePath)); // Output results @@ -293,77 +302,86 @@ System.err.println("OutputStream konnte nicht geschlossen werden!"); } } + /* + * remove all triples that contain information about begin and end of helices + */ - if(dlLearn) - { - File filename = confFileForAll; - - /* - * load RDF file and perform learn algorithm for every .conf-file - */ - /*ComponentManager cm = ComponentManager.getInstance(); - KnowledgeSource ks = cm.knowledgeSource(OWLFile.class); - cm.applyConfigEntry(ks, "url","file://" + filename ); - ReasonerComponent rc = cm.reasoner(FastInstanceChecker.class); + Property beginsAt = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "beginsAt"); + trainmodel = removeStatementsWithPoperty(trainmodel, beginsAt); + Property endsAt = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "endsAt"); + trainmodel = removeStatementsWithPoperty(trainmodel, endsAt); + Resource residue = ResourceFactory.createResource("http://bio2rdf.org/pdb:Residue"); + trainmodel = removeStatementsWithObject(trainmodel, residue); + } + + /* + * write out the files that contain information about which proteins originate from which species + */ + HashMap<String, File> proteinsOfSpecies = new HashMap<String, File>(); + for (int i = 0; i < trainSet.getTrainset().length; i++){ + if (proteinsOfSpecies.get(trainSet.getTrainset()[i].getSpecies()) == null){ + File speciesProteins = new File(dataDir + trainSet.getTrainset()[i].getSpecies() + ".pos"); + proteinsOfSpecies.put(trainSet.getTrainset()[i].getSpecies(), speciesProteins); + } + if (proteinsOfSpecies.get(trainSet.getTrainset()[i].getSpecies()).canWrite()) { try { - rc.init(); - } catch (ComponentInitException e1) { - e1.printStackTrace(); + FileWriter out = new FileWriter(proteinsOfSpecies.get(trainSet.getTrainset()[i].getSpecies()), true); + String line = trainSet.getTrainset()[i].getPdbID() + "." + + trainSet.getTrainset()[i].getChainID() + ".\n"; + out.write(line); + out.close(); + } catch (FileNotFoundException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); } - Start start = null; - Iterator<Resource> aa = confFilePerResidue.keySet().iterator(); - while ( aa.hasNext() ){ - Resource nextRes = aa.next(); - System.out.println(confFilePerResidue.get(nextRes).getAbsolutePath()); - try{ - start = new Start(confFilePerResidue.get(nextRes)); - } catch (ComponentInitException e) { - e.printStackTrace(); - } catch (FileNotFoundException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } catch (org.dllearner.confparser.ParseException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - start.start(false); - Description d = start.getLearningAlgorithm().getCurrentlyBestDescription(); - System.out.println(d.toKBSyntaxString()); - } - */ - } - - if(wekaLearn) - { } } - if(save) + if(dlLearn) { - String infos = ""; - for(int i=0; i < trainSet.getTrainset().length; i++) - { - infos = trainSet.getTrainset()[i].getPdbID() + "." + - trainSet.getTrainset()[i].getChainID() + "." + - trainSet.getTrainset()[i].getSpecies(); - if(i != trainSet.getTrainset().length - 1) - { - infos += "\n"; - } - } + - try - { - PrintStream out = new PrintStream (memory); - out.println(infos); - out.close(); + /* + * load RDF file and perform learn algorithm for every .conf-file + */ + /*ComponentManager cm = ComponentManager.getInstance(); + KnowledgeSource ks = cm.knowledgeSource(OWLFile.class); + cm.applyConfigEntry(ks, "url","file://" + filename ); + ReasonerComponent rc = cm.reasoner(FastInstanceChecker.class); + try { + rc.init(); + } catch (ComponentInitException e1) { + e1.printStackTrace(); } - catch(FileNotFoundException e) - { - e.printStackTrace(); + Start start = null; + Iterator<Resource> aa = confFilePerResidue.keySet().iterator(); + while ( aa.hasNext() ){ + Resource nextRes = aa.next(); + System.out.println(confFilePerResidue.get(nextRes).getAbsolutePath()); + try{ + start = new Start(confFilePerResidue.get(nextRes)); + } catch (ComponentInitException e) { + e.printStackTrace(); + } catch (FileNotFoundException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (org.dllearner.confparser.ParseException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + start.start(false); + Description d = start.getLearningAlgorithm().getCurrentlyBestDescription(); + System.out.println(d.toKBSyntaxString()); } + */ + } + if(wekaLearn) + { } } @@ -700,6 +718,7 @@ resConfFiles.put(trp, new File(dir + trp.getLocalName() + ".conf")); resConfFiles.put(tyr, new File(dir + tyr.getLocalName() + ".conf")); resConfFiles.put(sel, new File(dir + sel.getLocalName() + ".conf")); + resConfFiles.put(hyt, new File(dir + hyt.getLocalName() + ".conf")); confFilePerResidue = resConfFiles; Added: trunk/scripts/src/main/java/org/dllearner/examples/pdb/RdfFileFilter.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/RdfFileFilter.java (rev 0) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/RdfFileFilter.java 2011-06-23 08:48:18 UTC (rev 2927) @@ -0,0 +1,15 @@ +package org.dllearner.examples.pdb; + +import java.io.File; +import java.io.FileFilter; + +public class RdfFileFilter implements FileFilter +{ + private final String extension = new String("rdf"); + + public boolean accept(File file) + { + return file.getName().toLowerCase().endsWith(extension); + } +} + Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/TrainAndTestSet.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/TrainAndTestSet.java 2011-06-23 08:17:44 UTC (rev 2926) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/TrainAndTestSet.java 2011-06-23 08:48:18 UTC (rev 2927) @@ -148,6 +148,10 @@ return set; } + private String[] pdbEntry(int linenr, ArrayList<String> lines){ + String line =(String) lines.get(linenr); + return line.split("\t"); + } private String getPdbID (int lineNumber, ArrayList<String> lines ) { // Initialize a LineNumberReader This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <km...@us...> - 2011-10-26 07:52:41
|
Revision: 3320 http://dl-learner.svn.sourceforge.net/dl-learner/?rev=3320&view=rev Author: kmpf Date: 2011-10-26 07:52:30 +0000 (Wed, 26 Oct 2011) Log Message: ----------- Added a few classes for hopefully nicer code. Especially the two classes for DL-Learner and Weka code. Even though the Weka thing does not work at the moment. Modified Paths: -------------- trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java Added Paths: ----------- trunk/scripts/src/main/java/org/dllearner/examples/pdb/AminoAcids.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/ConfFileFilter.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBDLLearner.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBIdRdfModel.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBProtein.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBWekaLearner.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/ProteinDataSet.java Removed Paths: ------------- trunk/scripts/src/main/java/org/dllearner/examples/pdb/PdbProtein.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/RdfFileFilter.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/TrainAndTestSet.java Added: trunk/scripts/src/main/java/org/dllearner/examples/pdb/AminoAcids.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/AminoAcids.java (rev 0) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/AminoAcids.java 2011-10-26 07:52:30 UTC (rev 3320) @@ -0,0 +1,121 @@ +package org.dllearner.examples.pdb; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.PrintStream; +import java.util.HashMap; + +import com.hp.hpl.jena.rdf.model.Resource; +import com.hp.hpl.jena.rdf.model.ResourceFactory; + +public final class AminoAcids { + public static final Resource ALA = ResourceFactory.createResource("http://bio2rdf.org/pdb:Alanine"); + public static final Resource CYS = ResourceFactory.createResource("http://bio2rdf.org/pdb:Cysteine"); + public static final Resource ASP = ResourceFactory.createResource("http://bio2rdf.org/pdb:AsparticAcid"); + public static final Resource GLU = ResourceFactory.createResource("http://bio2rdf.org/pdb:GlutamicAcid"); + public static final Resource PHE = ResourceFactory.createResource("http://bio2rdf.org/pdb:Phenylalanine"); + public static final Resource GLY = ResourceFactory.createResource("http://bio2rdf.org/pdb:Glycine"); + public static final Resource HIS = ResourceFactory.createResource("http://bio2rdf.org/pdb:Histidine"); + public static final Resource ILE = ResourceFactory.createResource("http://bio2rdf.org/pdb:Isoleucine"); + public static final Resource LYS = ResourceFactory.createResource("http://bio2rdf.org/pdb:Lysine"); + public static final Resource LEU = ResourceFactory.createResource("http://bio2rdf.org/pdb:Leucine"); + public static final Resource MET = ResourceFactory.createResource("http://bio2rdf.org/pdb:Methionine"); + public static final Resource ASN = ResourceFactory.createResource("http://bio2rdf.org/pdb:Asparagine"); + public static final Resource PRO = ResourceFactory.createResource("http://bio2rdf.org/pdb:Proline"); + public static final Resource GLN = ResourceFactory.createResource("http://bio2rdf.org/pdb:Glutamine"); + public static final Resource ARG = ResourceFactory.createResource("http://bio2rdf.org/pdb:Arginine"); + public static final Resource SER = ResourceFactory.createResource("http://bio2rdf.org/pdb:Serine"); + public static final Resource THR = ResourceFactory.createResource("http://bio2rdf.org/pdb:Threonine"); + public static final Resource VAL = ResourceFactory.createResource("http://bio2rdf.org/pdb:Valine"); + public static final Resource TRP = ResourceFactory.createResource("http://bio2rdf.org/pdb:Tryptophan"); + public static final Resource TYR = ResourceFactory.createResource("http://bio2rdf.org/pdb:Tyrosine"); + public static final Resource SEL = ResourceFactory.createResource("http://bio2rdf.org/pdb:Selenomethionine"); + public static final Resource HYT = ResourceFactory.createResource("http://bio2rdf.org/pdb:2-hydroxy-tryptophan"); + + public static HashMap<Resource, File> getAllConfFiles (String dir, String confFileName){ + HashMap<Resource, File> aminoAcidsConfFiles = new HashMap<Resource, File>(30); + aminoAcidsConfFiles.put(ALA, new File(dir + confFileName.replace(".conf", "." + ALA.getLocalName()) + ".conf")); + aminoAcidsConfFiles.put(CYS, new File(dir + confFileName.replace(".conf", "." + CYS.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(ASP, new File(dir + confFileName.replace(".conf", "." + ASP.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(GLU, new File(dir + confFileName.replace(".conf", "." + GLU.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(PHE, new File(dir + confFileName.replace(".conf", "." + PHE.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(GLY, new File(dir + confFileName.replace(".conf", "." + GLY.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(HIS, new File(dir + confFileName.replace(".conf", "." + HIS.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(ILE, new File(dir + confFileName.replace(".conf", "." + ILE.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(LYS, new File(dir + confFileName.replace(".conf", "." + LYS.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(LEU, new File(dir + confFileName.replace(".conf", "." + LEU.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(MET, new File(dir + confFileName.replace(".conf", "." + MET.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(ASN, new File(dir + confFileName.replace(".conf", "." + ASN.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(PRO, new File(dir + confFileName.replace(".conf", "." + PRO.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(GLN, new File(dir + confFileName.replace(".conf", "." + GLN.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(ARG, new File(dir + confFileName.replace(".conf", "." + ARG.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(SER, new File(dir + confFileName.replace(".conf", "." + SER.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(THR, new File(dir + confFileName.replace(".conf", "." + THR.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(VAL, new File(dir + confFileName.replace(".conf", "." + VAL.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(TRP, new File(dir + confFileName.replace(".conf", "." + TRP.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(TYR, new File(dir + confFileName.replace(".conf", "." + TYR.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(SEL, new File(dir + confFileName.replace(".conf", "." + SEL.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(HYT, new File(dir + confFileName.replace(".conf", "." + HYT.getLocalName() + ".conf"))); + return aminoAcidsConfFiles; + } + + public static HashMap<Resource, PrintStream> getAminoAcidPrintStreamMap(HashMap<Resource, File> allConfFiles){ + // put all amino acid resources and the their conf-files together + HashMap<Resource, PrintStream> resprint = new HashMap<Resource, PrintStream>(30); + try{ + resprint.put(ALA, new PrintStream(allConfFiles.get(ALA))); + resprint.put(CYS, new PrintStream(allConfFiles.get(CYS))); + resprint.put(ASP, new PrintStream(allConfFiles.get(ASP))); + resprint.put(GLU, new PrintStream(allConfFiles.get(GLU))); + resprint.put(PHE, new PrintStream(allConfFiles.get(PHE))); + resprint.put(GLY, new PrintStream(allConfFiles.get(GLY))); + resprint.put(HIS, new PrintStream(allConfFiles.get(HIS))); + resprint.put(ILE, new PrintStream(allConfFiles.get(ILE))); + resprint.put(LYS, new PrintStream(allConfFiles.get(LYS))); + resprint.put(LEU, new PrintStream(allConfFiles.get(LEU))); + resprint.put(MET, new PrintStream(allConfFiles.get(MET))); + resprint.put(ASN, new PrintStream(allConfFiles.get(ASN))); + resprint.put(PRO, new PrintStream(allConfFiles.get(PRO))); + resprint.put(GLN, new PrintStream(allConfFiles.get(GLN))); + resprint.put(ARG, new PrintStream(allConfFiles.get(ARG))); + resprint.put(SER, new PrintStream(allConfFiles.get(SER))); + resprint.put(THR, new PrintStream(allConfFiles.get(THR))); + resprint.put(VAL, new PrintStream(allConfFiles.get(VAL))); + resprint.put(TRP, new PrintStream(allConfFiles.get(TRP))); + resprint.put(TYR, new PrintStream(allConfFiles.get(TYR))); + resprint.put(SEL, new PrintStream(allConfFiles.get(SEL))); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } + return resprint; + } + + public static HashMap<Resource, String> getAminoAcidArffAttributeMap(){ + HashMap<Resource, String> resdata = new HashMap<Resource, String>(30); + resdata.put(ALA, new String("2,0,0.5,?,?")); + resdata.put(CYS, new String("1,0,1,?,0")); + resdata.put(ASP, new String("0,-1,1,?,-1")); + resdata.put(GLU, new String("0,-1,2,?,-1")); + resdata.put(PHE, new String("2,0,2,1,?")); + resdata.put(GLY, new String("2,0,0.5,?,?")); + resdata.put(HIS, new String("1,1,2,1,1")); + resdata.put(ILE, new String("2,0,2,0,?")); + resdata.put(LYS, new String("1,1,2,?,1")); + resdata.put(LEU, new String("2,0,2,0,?")); + resdata.put(MET, new String("2,0,2,?,?")); + resdata.put(ASN, new String("0,0,1,?,0")); + resdata.put(PRO, new String("?,0,1,?,?")); + resdata.put(GLN, new String("0,0,2,?,0")); + resdata.put(ARG, new String("0,1,2,?,1")); + resdata.put(SER, new String("0,0,0.5,?,0")); + resdata.put(THR, new String("1,0,1,?,0,")); + resdata.put(VAL, new String("2,0,1,0,?")); + resdata.put(TRP, new String("1,0,2,1,1")); + resdata.put(TYR, new String("1,0,2,1,0")); + resdata.put(SEL, new String("?,?,?,?,?")); + + return resdata; + } + + +} Copied: trunk/scripts/src/main/java/org/dllearner/examples/pdb/ConfFileFilter.java (from rev 3298, trunk/scripts/src/main/java/org/dllearner/examples/pdb/RdfFileFilter.java) =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/ConfFileFilter.java (rev 0) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/ConfFileFilter.java 2011-10-26 07:52:30 UTC (rev 3320) @@ -0,0 +1,15 @@ +package org.dllearner.examples.pdb; + +import java.io.File; +import java.io.FileFilter; + +public class ConfFileFilter implements FileFilter +{ + private final String extension = new String("conf"); + + public boolean accept(File file) + { + return file.getName().toLowerCase().endsWith(extension); + } +} + Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java 2011-10-25 07:49:07 UTC (rev 3319) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java 2011-10-26 07:52:30 UTC (rev 3320) @@ -1,801 +1,389 @@ package org.dllearner.examples.pdb; import java.io.File; -import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileWriter; import java.io.IOException; -import java.io.InputStreamReader; -import java.io.LineNumberReader; import java.io.PrintStream; -import java.text.SimpleDateFormat; import java.util.ArrayList; -import java.util.Date; import java.util.HashMap; import java.util.Iterator; -import java.util.Set; +import org.apache.log4j.ConsoleAppender; +import org.apache.log4j.FileAppender; +import org.apache.log4j.HTMLLayout; +import org.apache.log4j.Layout; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import org.apache.log4j.PatternLayout; import org.dllearner.cli.Start; import org.dllearner.core.ComponentInitException; import org.dllearner.core.ComponentManager; -import org.dllearner.core.AbstractKnowledgeSource; -import org.dllearner.core.AbstractReasonerComponent; +import org.dllearner.core.LearningProblemUnsupportedException; +import org.dllearner.core.ReasonerComponent; import org.dllearner.core.owl.Description; -import org.dllearner.core.owl.Individual; -import org.dllearner.kb.OWLFile; import org.dllearner.reasoning.FastInstanceChecker; -import org.xml.sax.InputSource; -import org.xml.sax.SAXException; -import com.hp.hpl.jena.query.Query; -import com.hp.hpl.jena.query.QueryExecution; -import com.hp.hpl.jena.query.QueryExecutionFactory; -import com.hp.hpl.jena.query.QueryFactory; -import com.hp.hpl.jena.query.QuerySolution; -import com.hp.hpl.jena.query.ResultSet; -import com.hp.hpl.jena.query.ResultSetFormatter; -import com.hp.hpl.jena.rdf.model.Literal; import com.hp.hpl.jena.rdf.model.NodeIterator; import com.hp.hpl.jena.rdf.model.Property; import com.hp.hpl.jena.rdf.model.ResIterator; import com.hp.hpl.jena.rdf.model.Resource; import com.hp.hpl.jena.rdf.model.ResourceFactory; import com.hp.hpl.jena.rdf.model.Statement; -import com.hp.hpl.jena.rdf.model.StmtIterator; -import com.hp.hpl.jena.vocabulary.RDFS; import com.dumontierlab.pdb2rdf.model.PdbRdfModel; -import com.dumontierlab.pdb2rdf.parser.PdbXmlParser; -import com.dumontierlab.pdb2rdf.util.Pdb2RdfInputIterator; -import com.dumontierlab.pdb2rdf.util.PdbsIterator; -import edu.stanford.nlp.io.EncodingPrintWriter.out; - - public class HelixRDFCreator { - private static Resource ala = ResourceFactory.createResource("http://bio2rdf.org/pdb:Alanine"); - private static Resource cys = ResourceFactory.createResource("http://bio2rdf.org/pdb:Cysteine"); - private static Resource asp = ResourceFactory.createResource("http://bio2rdf.org/pdb:AsparticAcid"); - private static Resource glu = ResourceFactory.createResource("http://bio2rdf.org/pdb:GlutamicAcid"); - private static Resource phe = ResourceFactory.createResource("http://bio2rdf.org/pdb:Phenylalanine"); - private static Resource gly = ResourceFactory.createResource("http://bio2rdf.org/pdb:Glycine"); - private static Resource his = ResourceFactory.createResource("http://bio2rdf.org/pdb:Histidine"); - private static Resource ile = ResourceFactory.createResource("http://bio2rdf.org/pdb:Isoleucine"); - private static Resource lys = ResourceFactory.createResource("http://bio2rdf.org/pdb:Lysine"); - private static Resource leu = ResourceFactory.createResource("http://bio2rdf.org/pdb:Leucine"); - private static Resource met = ResourceFactory.createResource("http://bio2rdf.org/pdb:Methionine"); - private static Resource asn = ResourceFactory.createResource("http://bio2rdf.org/pdb:Asparagine"); - private static Resource pro = ResourceFactory.createResource("http://bio2rdf.org/pdb:Proline"); - private static Resource gln = ResourceFactory.createResource("http://bio2rdf.org/pdb:Glutamine"); - private static Resource arg = ResourceFactory.createResource("http://bio2rdf.org/pdb:Arginine"); - private static Resource ser = ResourceFactory.createResource("http://bio2rdf.org/pdb:Serine"); - private static Resource thr = ResourceFactory.createResource("http://bio2rdf.org/pdb:Threonine"); - private static Resource val = ResourceFactory.createResource("http://bio2rdf.org/pdb:Valine"); - private static Resource trp = ResourceFactory.createResource("http://bio2rdf.org/pdb:Tryptophan"); - private static Resource tyr = ResourceFactory.createResource("http://bio2rdf.org/pdb:Tyrosine"); - private static Resource sel = ResourceFactory.createResource("http://bio2rdf.org/pdb:Selenomethionine"); - private static Resource hyt = ResourceFactory.createResource("http://bio2rdf.org/pdb:2-hydroxy-tryptophan"); + private static Logger _logger = Logger.getLogger(HelixRDFCreator.class); + private static Logger _rootLogger = Logger.getRootLogger(); - private static ArrayList<Resource> positives; - private static ArrayList<Resource> negatives; - - public void setNegatives(ArrayList<Resource> neg){ - negatives = neg; - } - - public ArrayList<Resource> getPositives(){ - return positives; - } - - public ArrayList<Resource> getNegatives(){ - return negatives; - } - - private static String dataDir = "../test/pdb/"; - private static HashMap<Resource, File> confFilePerResidue; - private static File confFileForAll; - - public void setPositives(ArrayList<Resource> pos){ - positives = pos; - } - + private static String _dataDir = "../test/pdb/"; + private static File _dir = new File(_dataDir); + /** * @param args * TODO: remove beginsAt, endsAt from model */ public static void main(String[] args) { + + // create loggers (a simple logger which outputs + // its messages to the console and a log file) + + // logger 1 is the console, where we print only info messages; + // the logger is plain, i.e. does not output log level etc. + Layout layout = new PatternLayout(); + + ConsoleAppender consoleAppender = new ConsoleAppender(layout); + // setting a threshold suppresses log messages below this level; + // this means that if you want to e.g. see all trace messages on + // console, you have to set the threshold and log level to trace + // (but we recommend just setting the log level to trace and observe + // the log file) + consoleAppender.setThreshold(Level.DEBUG); + + // logger 2 is writes to a file; it records all debug messages + // (you can choose HTML or TXT) + boolean htmlLog = false; + Layout layout2 = null; + FileAppender fileAppenderNormal = null; + String fileName; + if(htmlLog) { + layout2 = new HTMLLayout(); + fileName = _dataDir + "log/log.html"; + } else { + // simple variant: layout2 = new SimpleLayout(); + layout2 = new PatternLayout("%r [%t] %-5p %c :\n%m%n\n"); + fileName = _dataDir + "log/log.txt"; + } + try { + fileAppenderNormal = new FileAppender(layout2, fileName, false); + } catch (IOException e) { + e.printStackTrace(); + } + + // add both loggers + _rootLogger.removeAllAppenders(); + _rootLogger.addAppender(consoleAppender); + _rootLogger.addAppender(fileAppenderNormal); + _rootLogger.setLevel(Level.DEBUG); + + + Boolean fasta = true; + /* - * test = true -> use test data - * test = false -> use data set 1, 2, 3 or 4 - */ - Boolean test = true; - /* * rdfConf = true -> write out the .rdf and .conf-Files * rdfConf = false -> does not generate those files */ Boolean rdfConf = true; + /* * arff = true -> write out .arff-Files * arff = false -> does not generate those files */ - Boolean arff = false; + Boolean arff = true; /* * load = true -> load alle .rdf, .conf and .arff Files that can be found within the directory dataDir * load = false -> don't load anything */ - Boolean load = true; - Boolean dlLearn = false; - Boolean wekaLearn = false; + Boolean load = false; + Boolean dlLearn = true; + Boolean wekaLearn = true; - int dataSet = 1; + int dataSet = 5; + /* - * get data set files - */ - // data set 1 - String bt426 = dataDir + "bt426.list"; - File bt426List = new File(bt426); - // data set 2 - String plp273 = dataDir + "plp273.list"; - File plp273List = new File(plp273); - // data set 3 - String plp364 = dataDir + "plp364.list"; - File plp364List = new File(plp364); - // data set 4 - String plp399 = dataDir + "plp399.list"; - File plp399List = new File(plp399); - - /* * data for test purpose */ - String pdbID = "1XFF"; - String chainID = "A"; - File dir = new File(dataDir); +// PdbProtein testProtein = new PdbProtein("1XFF"); + PDBProtein testProtein = new PDBProtein("1XFF", "A"); /* - * generate trainset and fill trainmodel + * create a training data set */ - PdbRdfModel trainmodel = new PdbRdfModel(); - TrainAndTestSet trainSet = new TrainAndTestSet(); + ProteinDataSet proteinSet; - if ( !test ) - { - switch (dataSet) { - case 1: - trainSet = new TrainAndTestSet(bt426List); - break; - case 2: - trainSet = new TrainAndTestSet(plp273List); - break; - case 3: - trainSet = new TrainAndTestSet(plp364List); - break; - case 4: - trainSet = new TrainAndTestSet(plp399List); - break; - } + switch (dataSet) { + case 1: + proteinSet = ProteinDataSet.bt426(); + break; + case 2: + proteinSet = ProteinDataSet.plp273(); + break; + case 3: + proteinSet = ProteinDataSet.plp364(); + break; + case 4: + proteinSet = ProteinDataSet.plp399(); + break; + default: + proteinSet = new ProteinDataSet(testProtein); + break; } - else - { - trainSet = new TrainAndTestSet(pdbID, chainID); - } - HashMap<String,File> rdfFiles = new HashMap<String,File>(); - HashMap<String,File> arffFiles = new HashMap<String,File>(); - - if (load) - { - System.out.println("Starting to load files in " + dataDir ); - File[] pdbDir = dir.listFiles(new DirectoryFileFilter()); - for (File actDir : pdbDir) { - File[] rdfFilesInActDir = actDir.listFiles(new RdfFileFilter()); - try { - System.out.println("Looking for Files in " + actDir.getCanonicalPath() ); - } catch (IOException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - for (File rdfFile : rdfFilesInActDir) { - rdfFiles.put(rdfFile.getName().substring(0, 3), rdfFile); - System.out.println("Found RDF File for PDB ID " + rdfFile.getName().substring(0, 3) ); - System.out.println("Found RDF File " + rdfFile.getName() ); - } - File[] arffFilesInActDir = actDir.listFiles(new ArffFileFilter()); - for (File arffFile : arffFilesInActDir) { - arffFiles.put(arffFile.getName().substring(0, 3), arffFile); - System.out.println("Found RDF File for PDB ID " + arffFile.getName().substring(0, 3) ); - System.out.println("Found ARFF File " + arffFile.getName() ); - } - } - } - - - /* * generate a PdbRdfModel for every pdbID */ + PDBIdRdfModel trainmodel; - - - for (int i = 0; i < trainSet.getTrainset().length; i++) + for (int i = 0; i < proteinSet.getProteinset().size(); i++) { - System.out.println("pdbId: " + trainSet.getTrainset()[i].getPdbID()); - System.out.println("chainID: " + trainSet.getTrainset()[i].getChainID()); - trainmodel.removeAll(); - trainmodel.add(getRdfModelForIds(trainSet.getTrainset()[i].getPdbID(), trainSet.getTrainset()[i].getChainID())); - - /* - * extract the species the protein originates from - */ - trainSet.getTrainset()[i].setSpecies(getSpecies(trainmodel, trainSet.getTrainset()[i].getPdbID())); - - /* - * as we have sometimes to handle several amino acid chains we need the first - * amino acid of every chain, they are returned within a ResIterator - */ - ResIterator niter = getFirstAA(trainmodel); - - /* - * we add the information of which amino acid is the fourth predecessor of which other amino acid - */ - trainmodel = addDistanceInfo(trainmodel); - - /* - * take all amino acids which are in helices and put them into the - * global positives ArrayList, and all others in the global negatives ArrayList - */ - createPositivesAndNegatives(niter, trainmodel); - String rdfFile; - String arffFile; - if (trainSet.getTrainset()[i].getChainID().length() == 0) - { - rdfFile = trainSet.getTrainset()[i].getPdbID().toUpperCase() + ".rdf"; - arffFile = trainSet.getTrainset()[i].getPdbID().toUpperCase() + ".arff"; - } - else - { - rdfFile = trainSet.getTrainset()[i].getPdbID().toUpperCase() + "." - + trainSet.getTrainset()[i].getChainID().toUpperCase() + ".rdf"; - arffFile = trainSet.getTrainset()[i].getPdbID().toUpperCase() + "." - + trainSet.getTrainset()[i].getChainID().toUpperCase() + ".arff"; - } - String pdbDir = dataDir + trainSet.getTrainset()[i].getPdbID() + "/"; - File directory = new File(pdbDir); - directory.mkdir(); - String rdfFilePath = pdbDir + rdfFile; - String arffFilePath = pdbDir + arffFile; - - /* - * if arff = true create pdbID.arff files - */ - if (arff) - { - niter = getFirstAA(trainmodel); - createArffFile(arffFilePath, trainmodel, trainSet, niter); - } - - /* - * if rdfConf = true create pdbID.rdf and *.conf files - */ - if(rdfConf) - { - try - { - /* - * creatConfFile() - * writes the conf-Files and saves there File-objects in: - * confFileForAll and confFilePerResidue - */ - createConfFile(pdbDir, rdfFile, trainmodel); - PrintStream out = new PrintStream (new File(rdfFilePath)); - - // Output results - trainmodel.write(out, "RDF/XML"); - - // Important - free up resources used running the query - out.close(); - } - catch (IOException e) - { - System.err.println("OutputStream konnte nicht geschlossen werden!"); - } - } - /* - * remove all triples that contain information about begin and end of helices - */ - - Property beginsAt = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "beginsAt"); - trainmodel = removeStatementsWithPoperty(trainmodel, beginsAt); - Property endsAt = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "endsAt"); - trainmodel = removeStatementsWithPoperty(trainmodel, endsAt); - Resource residue = ResourceFactory.createResource("http://bio2rdf.org/pdb:Residue"); - trainmodel = removeStatementsWithObject(trainmodel, residue); - } - - /* - * write out the files that contain information about which proteins originate from which species - */ - HashMap<String, File> proteinsOfSpecies = new HashMap<String, File>(); - for (int i = 0; i < trainSet.getTrainset().length; i++){ - if (proteinsOfSpecies.get(trainSet.getTrainset()[i].getSpecies()) == null){ - File speciesProteins = new File(dataDir + trainSet.getTrainset()[i].getSpecies() + ".pos"); - proteinsOfSpecies.put(trainSet.getTrainset()[i].getSpecies(), speciesProteins); - } - if (proteinsOfSpecies.get(trainSet.getTrainset()[i].getSpecies()).canWrite()) { + if (rdfConf || arff) { + + PDBProtein protein = proteinSet.getProteinset().get(i); + String pdbDir = _dataDir + protein.getPdbID() + "/"; + File directory = new File(pdbDir); + if(! directory.exists()) directory.mkdir(); + // + //String arffFilePath = pdbDir + protein.getArffFileName(); + + _logger.info("PDB ID: " + protein.getPdbID()); + _logger.info("chain ID: " + protein.getChainID()); + trainmodel = new PDBIdRdfModel(protein); + + if (fasta){ + trainmodel.createFastaFile(pdbDir); + } + + + /* + * if arff = true create pdbID.arff files + */ + + /* + * as we have sometimes to handle several amino acid chains we need the first + * amino acid of every chain, they are returned within a ResIterator + */ + + if (arff) + { + ResIterator niter = trainmodel.getFirstAA(); + createArffFile(pdbDir, trainmodel, niter); + } + + /* + * remove all triples that contain information about begin and end of helices + */ + Property beginsAt = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "beginsAt"); + trainmodel.removeStatementsWithPoperty(beginsAt); + Property endsAt = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "endsAt"); + trainmodel.removeStatementsWithPoperty(endsAt); + Resource residue = ResourceFactory.createResource("http://bio2rdf.org/pdb:Residue"); + trainmodel.removeStatementsWithObject(residue); + + /* + * we add the information which amino acid is the fourth predecessor of which other amino acid + */ + trainmodel.addDistanceInfo(); + + /* + * if rdfConf = true create pdbID.rdf and *.conf files + */ + + + + if(rdfConf) + { + String rdfFilePath = pdbDir + protein.getRdfFileName(); + try + { + /* + * creatConfFile() + * writes the conf-Files and saves there File-objects in: + * confFileForAll and confFilePerResidue + */ + createConfFile(pdbDir, trainmodel); + + PrintStream out = new PrintStream (new File(rdfFilePath)); + + // Output results + trainmodel.getModel().write(out, "RDF/XML"); + + // Important - free up resources used running the query + out.close(); + } + catch (FileNotFoundException e) + { + _logger.error("File " + rdfFilePath + " konnte nicht gefunden werden!"); + e.printStackTrace(); + } + } + /* + * For every protein source species create a file that contains a list of all + * proteins that originate from that particular species. If it already exists + * we will append to it. + */ + File speciesProteins = new File(_dataDir + protein.getSpecies() + ".pos"); + try { - FileWriter out = new FileWriter(proteinsOfSpecies.get(trainSet.getTrainset()[i].getSpecies()), true); - String line = trainSet.getTrainset()[i].getPdbID() + "." - + trainSet.getTrainset()[i].getChainID() + ".\n"; + String line = protein.getPdbID() + "." + protein.getChainID() + "." + protein.getSpecies() + "\n"; + FileWriter out = new FileWriter(speciesProteins, true); + _logger.debug("Write " + line + "to file " + speciesProteins.getPath() + speciesProteins.getName()); out.write(line); out.close(); } catch (FileNotFoundException e) { - // TODO Auto-generated catch block + _logger.error("Could not find file " + speciesProteins.getPath() + speciesProteins.getName()); e.printStackTrace(); } catch (IOException e) { - // TODO Auto-generated catch block + _logger.error("Something went wrong while trying to write to " + speciesProteins.getPath() + speciesProteins.getName()); e.printStackTrace(); } - } } if(dlLearn) { - - - /* - * load RDF file and perform learn algorithm for every .conf-file - */ - /*ComponentManager cm = ComponentManager.getInstance(); - KnowledgeSource ks = cm.knowledgeSource(OWLFile.class); - cm.applyConfigEntry(ks, "url","file://" + filename ); - ReasonerComponent rc = cm.reasoner(FastInstanceChecker.class); - try { - rc.init(); - } catch (ComponentInitException e1) { - e1.printStackTrace(); - } - Start start = null; - Iterator<Resource> aa = confFilePerResidue.keySet().iterator(); - while ( aa.hasNext() ){ - Resource nextRes = aa.next(); - System.out.println(confFilePerResidue.get(nextRes).getAbsolutePath()); - try{ - start = new Start(confFilePerResidue.get(nextRes)); - } catch (ComponentInitException e) { - e.printStackTrace(); - } catch (FileNotFoundException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } catch (org.dllearner.confparser.ParseException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - start.start(false); - Description d = start.getLearningAlgorithm().getCurrentlyBestDescription(); - System.out.println(d.toKBSyntaxString()); - } - */ + startDlLearner(); } if(wekaLearn) { + startWekaLearner(); } } - private static PdbRdfModel getRdfModelForIds(String pdbID ,String chainID) { - - /* - * i is an Iterator over an XML InputSource - */ - String[] pdbIDs = {pdbID}; - Pdb2RdfInputIterator i = new PdbsIterator(pdbIDs); - PdbXmlParser parser = new PdbXmlParser(); - PdbRdfModel allmodels = new PdbRdfModel(); - try { - while (i.hasNext()) - { - final InputSource input = i.next(); - PdbRdfModel model = parser.parse(input, new PdbRdfModel()); - /* - * jedes Model muss gleich nach den relevanten Daten durchsucht werden, - * da ansonsten Probleme mit der Speichergröße auftreten können. - */ - allmodels.add(getData(model, pdbID, chainID)); - } - } - catch (IOException e) - { - // TODO Auto-generated catch block - e.printStackTrace(); + private static void startDlLearner(){ + HashMap<String, File> pdbIDConfFile = loadConfFiles(_dir); + for (String pdbID : pdbIDConfFile.keySet()){ + try { + new PDBDLLearner(pdbIDConfFile.get(pdbID)); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (ComponentInitException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (LearningProblemUnsupportedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } } - catch (SAXException e) - { - // TODO Auto-generated catch block - e.printStackTrace(); - } - return allmodels; } - private static PdbRdfModel getData(PdbRdfModel model, String pdbID, String chainID) { - - // Beispiel einer SELECT Abfrage - /* String selectQuery = - * "SELECT { ?x1 ?x2 ?x3 .} " + - * "WHERE { ?x1 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://bio2rdf.org/pdb:Helix> .}"; - * Query query = QueryFactory.create(selectQuery); - * QueryExecution qe = QueryExecutionFactory.create(query, model); - * ResultSet select = qe.execSelect(); - * ResultSetFormatter.out (System.out, select, query); - * - */ - - // CONSTRUCT Abfrage - - PdbRdfModel construct = new PdbRdfModel(); - /* - * i do it kind of difficult, but i want to be certain that i only get the sequences of - * Polypeptides(L) which contain at least one Helix. Furthermore i collect the information - * about at which position helices begin and end. - * NOTE: this information has to be removed before outputing the model. But i will use this - * to check for positive and negative train amino acids - */ - /* - * ich brauche noch die selektion der chain und die info über den genursprungsorganismus - * rdf:resource="http://bio2rdf.org/pdb:3LQH/chain_A" - * http://bio2rdf.org/pdb:3LQH/chain_A/position_1596 - */ - - String queryString = ""; - - if (chainID.length() != 1 || pdbID.length() != 4) - { - queryString = - "PREFIX pdb: <http://bio2rdf.org/pdb:> " + - "PREFIX dcterms: <http://purl.org/dc/terms/> " + - "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> " + - "PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> " + - "PREFIX fn: <http://www.w3.org/2005/xpath-functions#> " + - "CONSTRUCT { ?x1 <http://bio2rdf.org/pdb:beginsAt> ?x2 ." + - " ?x1 <http://bio2rdf.org/pdb:endsAt> ?x3 . " + - " ?x5 <http://purl.org/dc/terms/isPartOf> ?x4 . " + - " ?x5 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?x6 ." + - " ?x5 <http://bio2rdf.org/pdb:isImmediatelyBefore> ?x7 ." + - " ?xxx rdfs:label ?label .} " + - "WHERE { ?x1 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://bio2rdf.org/pdb:Helix> ." + - " ?x1 <http://bio2rdf.org/pdb:beginsAt> ?x2 ." + - " ?x1 <http://bio2rdf.org/pdb:endsAt> ?x3 ." + - " ?x3 <http://purl.org/dc/terms/isPartOf> ?x4 ." + - " ?x4 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://bio2rdf.org/pdb:Polypeptide(L)> ." + - " ?x5 <http://purl.org/dc/terms/isPartOf> ?x4 ." + - " ?x5 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?x6 ." + - " ?xxx rdfs:label ?label FILTER (str(?xxx) = fn:concat(str(?x4), '/extraction/source/gene/organism')) . " + - // with the Optional clause i get the information by which amino acid - // a amino acid is followed - " OPTIONAL { ?x5 <http://bio2rdf.org/pdb:isImmediatelyBefore> ?x7 . } .}"; + private static void startWekaLearner() { + HashMap<String, File> pdbIDArffFile = loadArffFiles(_dir); + for (String pdbID: pdbIDArffFile.keySet()){ + try { + new PDBWekaLearner(pdbIDArffFile.get(pdbID)); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } } - else - { - queryString = - "PREFIX pdb: <http://bio2rdf.org/pdb:> " + - "PREFIX dcterms: <http://purl.org/dc/terms/> " + - "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> " + - "PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> " + - "PREFIX fn: <http://www.w3.org/2005/xpath-functions#> " + - "CONSTRUCT { ?x1 <http://bio2rdf.org/pdb:beginsAt> ?x2 ." + - " ?x1 <http://bio2rdf.org/pdb:endsAt> ?x3 . " + - " ?x5 <http://purl.org/dc/terms/isPartOf> ?x4 . " + - " ?x5 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?x6 ." + - " ?x5 <http://bio2rdf.org/pdb:isImmediatelyBefore> ?x7 ." + - " ?xxx rdfs:label ?label .} " + - "WHERE { ?x1 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://bio2rdf.org/pdb:Helix> ." + - " ?x1 <http://bio2rdf.org/pdb:beginsAt> ?x2 ." + - " ?x1 <http://bio2rdf.org/pdb:endsAt> ?x3 ." + - " ?x3 <http://purl.org/dc/terms/isPartOf> ?x4 ." + - " ?x4 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://bio2rdf.org/pdb:Polypeptide(L)> ." + - " ?x5 <http://purl.org/dc/terms/isPartOf> ?x4 ." + - " ?x5 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?x6 ." + - " ?x5 <http://bio2rdf.org/pdb:hasChainPosition> ?x8 ." + - " ?x8 <http://purl.org/dc/terms/isPartOf> <http://bio2rdf.org/pdb:" + pdbID.toUpperCase() + - "/chain_" + chainID.toUpperCase() + "> ." + - " ?xxx rdfs:label ?label FILTER (str(?xxx) = fn:concat(str(?x4), '/extraction/source/gene/organism')) . " + - // with the Optional clause i get the information by which amino acid - // a amino acid is followed - " OPTIONAL { ?x5 <http://bio2rdf.org/pdb:isImmediatelyBefore> ?x7 . } .}"; - } - - //System.out.println(queryString); - Query query = QueryFactory.create(queryString); - QueryExecution qe = QueryExecutionFactory.create(query, model); - construct.add(qe.execConstruct()); - qe.close(); - return construct; - } - - private static String getSpecies( PdbRdfModel model, String pdbID) { - String queryString ; - queryString = - "PREFIX pdb: <http://bio2rdf.org/pdb:> " + - "PREFIX dcterms: <http://purl.org/dc/terms/> " + - "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> " + - "PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> " + - "PREFIX fn: <http://www.w3.org/2005/xpath-functions#> " + - "SELECT ?species " + - "WHERE { ?x1 <http://purl.org/dc/terms/isPartOf> ?x4 ." + - " ?x1 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?x2 ." + - " ?x1 <http://bio2rdf.org/pdb:isImmediatelyBefore> ?x3 ." + - " ?x5 rdfs:label ?species FILTER (str(?x5) = fn:concat(str(?x4), '/extraction/source/gene/organism')) . }"; - - // System.out.println(queryString); - Query query = QueryFactory.create(queryString); - QueryExecution qe = QueryExecutionFactory.create(query, model); - String species = ""; - try - { - ResultSet results = qe.execSelect() ; - for ( ; results.hasNext() ; ) - { - /* - * every entry in the ResultSet has the same value - */ - QuerySolution soln = results.nextSolution() ; - Literal l = soln.getLiteral("species") ; // Get a result variable - must be a literal - species = l.getString(); + } + private static HashMap<String,File> loadConfFiles (File dir){ + HashMap<String,File> confFiles = new HashMap<String,File>(); + _logger.info("Starting to load files in " + dir ); + File[] pdbDir = dir.listFiles(new DirectoryFileFilter()); + for (File activeDirectory : pdbDir) { + File[] confFilesInActiveDirectory = activeDirectory.listFiles(new ConfFileFilter()); + _logger.info("Looking for Files in " + activeDirectory.getPath() ); + for (File confFile : confFilesInActiveDirectory) { + String confFileName = confFile.getName().substring(0, confFile.getName().indexOf(".conf")); + confFiles.put(confFileName, confFile); + _logger.info("Found .conf File " + confFile.getPath() ); } } - finally - { - qe.close() ; - } - return species; + return confFiles; } - - - private static ResIterator getFirstAA( PdbRdfModel model) { - PdbRdfModel construct = new PdbRdfModel(); - /* i look for all amino acids (AA) that have a successor - * but do not have a predecessor -> it's the first AA of every - * polypeptide chain - */ - - String queryString = - "PREFIX pdb: <http://bio2rdf.org/pdb:> " + - "CONSTRUCT { ?x1 pdb:isImmediatelyBefore ?x2 . } " + - "WHERE { ?x1 pdb:isImmediatelyBefore ?x2 . " + - // NOT EXISTS can be used with SPARQL 1.1 - //"NOT EXISTS { ?x3 pdb:isImmediatelyBefore ?x1 . } }"; - " OPTIONAL { ?x3 pdb:isImmediatelyBefore ?x1 . } " + - " FILTER ( !BOUND(?x3) ) }"; - Query query = QueryFactory.create(queryString); - QueryExecution qe = QueryExecutionFactory.create(query, model); - construct.add(qe.execConstruct()); - qe.close(); - ResIterator niter = construct.listSubjects(); - return niter; - } - private static PdbRdfModel addDistanceInfo(PdbRdfModel model){ - String queryString = - "PREFIX pdb: <http://bio2rdf.org/pdb:> " + - "CONSTRUCT { ?x1 pdb:isFourAminoAcidsBefore ?x5 . } " + - "WHERE { ?x1 pdb:isImmediatelyBefore ?x2 . " + - " ?x2 pdb:isImmediatelyBefore ?x3 . " + - " ?x3 pdb:isImmediatelyBefore ?x4 . " + - " ?x4 pdb:isImmediatelyBefore ?x5 . }"; - Query query = QueryFactory.create(queryString); - QueryExecution qe = QueryExecutionFactory.create(query, model); - model.add(qe.execConstruct()); - qe.close(); - return model; - } - - private static void createPositivesAndNegatives(ResIterator riter, PdbRdfModel model) { + private static HashMap<String,File> loadArffFiles (File dir){ + HashMap<String,File> arffFiles = new HashMap<String,File>(); + _logger.info("Starting to load files in " + dir ); - // Properties i have to check for while going through the AA-chain - Property iib = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "isImmediatelyBefore"); - Property ba = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "beginsAt"); - Property ea = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "endsAt"); - ArrayList<Resource> pos = new ArrayList<Resource>(); - ArrayList<Resource> neg = new ArrayList<Resource>(); - - // every element in riter stands for a AA-chain start - // every first amino acid indicates a new AA-chain - while (riter.hasNext()) { - // Initialization of variables needed - Resource aaOne = riter.nextResource(); - Resource currentaa = aaOne; - Resource nextaa = aaOne; - boolean inHelix = false; - - // look if there is a next AA - do { - // looks weird, but is needed to enter loop even for the last AA which does not have a iib-Property - currentaa = nextaa; - // die Guten ins Töpfchen ... - // if we get an non-empty iterator for pdb:beginsAt the next AAs are within a AA-chain - if(model.listResourcesWithProperty(ba, currentaa).hasNext() && !inHelix ){ - inHelix = true; - } - // die Schlechten ins Kröpfchen - // if we get an non-empty iterator for pdb:endsAt and are already within a AA-chain - // the AAs AFTER the current ones aren't within a helix - if (model.listResourcesWithProperty(ea, currentaa).hasNext() && inHelix){ - inHelix = false; - } - // get next AA if there is one - if (model.listObjectsOfProperty(currentaa, iib).hasNext()){ - nextaa = model.getProperty(currentaa, iib).getResource(); - } - - // add current amino acid to positives or negatives set - if (inHelix){ - pos.add(currentaa); - } else { - neg.add(currentaa); - } - - } while (currentaa.hasProperty(iib)) ; + File[] pdbDir = dir.listFiles(new DirectoryFileFilter()); + for (File activeDirectory : pdbDir) { + File[] arffFilesInActDir = activeDirectory.listFiles(new ArffFileFilter()); + _logger.info("Looking for .arff Files in " + activeDirectory.getPath()); + for (File arffFile : arffFilesInActDir) { + String arffFileName = arffFile.getName().substring(0, arffFile.getName().indexOf(".arff")); + arffFiles.put(arffFileName, arffFile); + _logger.info("Found .arff File " + arffFile.getPath()); + } } - positives = pos; - negatives = neg; + return arffFiles; } - - - - private static PdbRdfModel removeStatementsWithPoperty(PdbRdfModel model, Property prop){ - - String queryString = - "PREFIX x:<" + prop.getNameSpace() + "> " + - "CONSTRUCT { ?x1 x:" + prop.getLocalName()+ " ?x2 . } " + - "WHERE { ?x1 x:" + prop.getLocalName() + " ?x2 . }"; - //System.out.println(queryString); - Query query = QueryFactory.create(queryString); - QueryExecution qe = QueryExecutionFactory.create(query, model); - StmtIterator stmtiter = qe.execConstruct().listStatements(); - qe.close(); - while(stmtiter.hasNext()){ - model.remove(stmtiter.next()); - } - - return model; - } - - private static PdbRdfModel removeStatementsWithObject(PdbRdfModel model, Resource res){ - - String queryString = - "PREFIX x:<" + res.getNameSpace() + "> " + - "CONSTRUCT { ?x1 ?x2 x:" + res.getLocalName() + " . } " + - "WHERE { ?x1 ?x2 x:" + res.getLocalName() + " . }"; - // System.out.println(queryString); - Query query = QueryFactory.create(queryString); - QueryExecution qe = QueryExecutionFactory.create(query, model); - StmtIterator stmtiter = qe.execConstruct().listStatements(); - qe.close(); - while(stmtiter.hasNext()){ - model.remove(stmtiter.next()); - } - - return model; - } - - - private static void createConfFile(String dir, String rdffile, PdbRdfModel model){ + private static void createConfFile(String pdbDir, PDBIdRdfModel model){ try { + PDBProtein protein = model.getProtein(); // the file with all amino acids - String pdbname = dir + "pdb" + ".conf"; - confFileForAll = new File(pdbname); - PrintStream out = new PrintStream (confFileForAll); - // add import statements - out.println("import(\"AA_properties.owl\");"); - out.println("import(\"" + rdffile + "\");"); - out.println(); + String confFilePath = pdbDir + protein.getConfFileName(); + PrintStream confFile = new PrintStream (new File(confFilePath)); + // add import statements to confFile + String importStmt = new String("import(\"../AA_properties.owl\");\n" + + "import(\"" + protein.getRdfFileName() + "\");\n"); + confFile.println(importStmt); - HashMap<Resource, File> resConfFiles = new HashMap<Resource, File>(30); - resConfFiles.put(ala, new File(dir + ala.getLocalName() + ".conf")); - resConfFiles.put(cys, new File(dir + cys.getLocalName() + ".conf")); - resConfFiles.put(asp, new File(dir + asp.getLocalName() + ".conf")); - resConfFiles.put(glu, new File(dir + glu.getLocalName() + ".conf")); - resConfFiles.put(phe, new File(dir + phe.getLocalName() + ".conf")); - resConfFiles.put(gly, new File(dir + gly.getLocalName() + ".conf")); - resConfFiles.put(his, new File(dir + his.getLocalName() + ".conf")); - resConfFiles.put(ile, new File(dir + ile.getLocalName() + ".conf")); - resConfFiles.put(lys, new File(dir + lys.getLocalName() + ".conf")); - resConfFiles.put(leu, new File(dir + leu.getLocalName() + ".conf")); - resConfFiles.put(met, new File(dir + met.getLocalName() + ".conf")); - resConfFiles.put(asn, new File(dir + asn.getLocalName() + ".conf")); - resConfFiles.put(pro, new File(dir + pro.getLocalName() + ".conf")); - resConfFiles.put(gln, new File(dir + gln.getLocalName() + ".conf")); - resConfFiles.put(arg, new File(dir + arg.getLocalName() + ".conf")); - resConfFiles.put(ser, new File(dir + ser.getLocalName() + ".conf")); - resConfFiles.put(thr, new File(dir + thr.getLocalName() + ".conf")); - resConfFiles.put(val, new File(dir + val.getLocalName() + ".conf")); - resConfFiles.put(trp, new File(dir + trp.getLocalName() + ".conf")); - resConfFiles.put(tyr, new File(dir + tyr.getLocalName() + ".conf")); - resConfFiles.put(sel, new File(dir + sel.getLocalName() + ".conf")); - resConfFiles.put(hyt, new File(dir + hyt.getLocalName() + ".conf")); - confFilePerResidue = resConfFiles; + HashMap<Resource, File> confFilePerResidue = AminoAcids.getAllConfFiles(pdbDir, protein.getConfFileName()); + + HashMap<Resource, PrintStream> resprint = AminoAcids.getAminoAcidPrintStreamMap(confFilePerResidue); - - - // put all amino acid resources and the their conf-files together - HashMap<Resource, PrintStream> resprint = new HashMap<Resource, PrintStream>(30); - resprint.put(ala, new PrintStream(resConfFiles.get(ala))); - resprint.put(cys, new PrintStream(resConfFiles.get(cys))); - resprint.put(asp, new PrintStream(resConfFiles.get(asp))); - resprint.put(glu, new PrintStream(resConfFiles.get(glu))); - resprint.put(phe, new PrintStream(resConfFiles.get(phe))); - resprint.put(gly, new PrintStream(resConfFiles.get(gly))); - resprint.put(his, new PrintStream(resConfFiles.get(his))); - resprint.put(ile, new PrintStream(resConfFiles.get(ile))); - resprint.put(lys, new PrintStream(resConfFiles.get(lys))); - resprint.put(leu, new PrintStream(resConfFiles.get(leu))); - resprint.put(met, new PrintStream(resConfFiles.get(met))); - resprint.put(asn, new PrintStream(resConfFiles.get(asn))); - resprint.put(pro, new PrintStream(resConfFiles.get(pro))); - resprint.put(gln, new PrintStream(resConfFiles.get(gln))); - resprint.put(arg, new PrintStream(resConfFiles.get(arg))); - resprint.put(ser, new PrintStream(resConfFiles.get(ser))); - resprint.put(thr, new PrintStream(resConfFiles.get(thr))); - resprint.put(val, new PrintStream(resConfFiles.get(val))); - resprint.put(trp, new PrintStream(resConfFiles.get(trp))); - resprint.put(tyr, new PrintStream(resConfFiles.get(tyr))); - resprint.put(sel, new PrintStream(resConfFiles.get(sel))); - Property type = ResourceFactory.createProperty("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "type"); - // add import statements to .conf files for amino acids + // add import statements to <PDB ID>_<Amino Acid>.conf files Iterator<Resource> keys = resprint.keySet().iterator(); - while (keys.hasNext()){ - Resource k = keys.next(); - resprint.get(k).println("import(\"AA_properties.owl\");"); - resprint.get(k).println("import(\"" + rdffile + "\");"); - resprint.get(k).println(); + resprint.get(keys.next()).println(importStmt); } - - /* - * the for-loops beneath may cause trouble, if there exists an amino acid within a structure that - * doesn't exists in our HashMap - */ + + // add every amino acid in positive list to <PDB ID>.conf and its corresponding <PDB ID>_<Amino Acid>.conf + ArrayList<Resource> positives = model.getPositives(); for (int i = 0 ; i < positives.size() ; i++ ) { - out.println("+\"" + positives.get(i).getURI() + "\""); + confFile.println("+\"" + positives.get(i).getURI() + "\""); try{ - Statement spo = model.getProperty(positives.get(i), type); + Statement spo = model.getModel().getProperty(positives.get(i), type); resprint.get(spo.getResource()).println("+\"" + positives.get(i).getURI() + "\""); } catch (NullPointerException e) { // What was the Object that probably caused the pain? - System.err.println("Object probably not in our HashMap: " + - model.getProperty(positives.get(i), type).getResource()); - e.getStackTrace(); + _logger.error("Object probably not in our HashMap: " + + model.getModel().getProperty(positives.get(i), type).getResource()); + e.printStackTrace(); } - // System.out.println("Couldn't find AA: " + positives.get(i).getURI()); - } + // add every amino acid in negative list to <PDB ID>.conf and its corresponding <PDB ID>_<Amino Acid>.conf + ArrayList<Resource> negatives = model.getNegatives(); for (int i = 0 ; i < negatives.size() ; i++ ) { - out.println("-\"" + negatives.get(i).getURI() + "\""); + confFile.println("-\"" + negatives.get(i).getURI() + "\""); try{ - Statement spo = model.getProperty(negatives.get(i), type); + Statement spo = model.getModel().getProperty(negatives.get(i), type); resprint.get(spo.getResource()).println("-\"" + negatives.get(i).getURI() + "\""); } catch (NullPointerException e) { // What was the Object that probably caused the pain? - System.err.println("Object probably not in our HashMap: " + - model.getProperty(negatives.get(i), type).getResource()); - e.getStackTrace(); + _logger.error("Object probably not in our HashMap: " + + model.getModel().getProperty(negatives.get(i), type).getResource()); + e.printStackTrace(); } - - // System.out.println("Couldn't find AA: " + positives.get(i).getURI()); - } // Important - free up resources used running the query - out.close(); + confFile.close(); Iterator<Resource> newkeys = resprint.keySet().iterator(); while ( newkeys.hasNext() ){ @@ -805,143 +393,103 @@ } catch (IOException e) { - System.err.println("OutputStream konnte nicht geschlossen werden!"); + _logger.error("OutputStream konnte nicht geschlossen werden!"); } } - private static void createArffFile(String arffFilePath, PdbRdfModel model, TrainAndTestSet sets, ResIterator riter){ - - String relation = "@RELATION "; - for (int i = 0; i < sets.getTrainset().length ; i++){ - System.out.println("Element " + i + " = " + sets.getTrainset()[i].getPdbID()); - relation += sets.getTrainset()[i]; - } - - /* - * ATTRIBUTES - */ - - // Integer declaring Position in chain - String attrPosInChain = "@ATTRIBUTE position_in_chain NUMERIC\n"; - // Helix = 1 Other = 0 - String attrHelix = "@ATTRIBUTE in_helix NUMERIC\n"; - // Hydrophilic = 0 Hydrophobic = 1 Very_hydrophobic = 2 - String attrHydrophob = "@ATTRIBUTE hydrophob NUMERIC\n"; - // Negative = -1 Neutral = 0 Positive = 1 - String attrCharge = "@ATTRIBUTE charge NUMERIC\n"; - // Large = 2 Small = 1 Tiny = 0.5 - String attrSize = "@ATTRIBUTE size NUMERIC\n"; - // Aliphatic = 0 Aromatic = 1 - String attrAromaticity = "@ATTRIBUTE aromaticity NUMERIC\n"; - // Donor = 1 Donor/Acceptor = 0 Acceptor = -1 - String attrHydrogenbonding = "@ATTRIBUTE hydrogen_bonding NUMERIC\n"; - - String attribute = attrPosInChain + attrHelix + attrHydrophob + - attrCharge + attrSize + attrAromaticity + attrHydrogenbonding + "\n"; - - String data = "@DATA\n"; - - HashMap<Resource, String> resdata = new HashMap<Resource, String>(30); - resdata.put(ala, new String("2,0,0.5,?,?\n")); - resdata.put(cys, new String("1,0,1,?,0\n")); - resdata.put(asp, new String("0,-1,1,?,-1\n")); - resdata.put(glu, new String("0,-1,2,?,-1\n")); - resdata.put(phe, new String("2,0,2,1,?\n")); - resdata.put(gly, new String("2,0,0.5,?,?\n")); - resdata.put(his, new String("1,1,2,1,1\n")); - resdata.put(ile, new String("2,0,2,0,?\n")); - resdata.put(lys, new String("1,1,2,?,1\n")); - resdata.put(leu, new String("2,0,2,0,?\n")); - resdata.put(met, new String("2,0,2,?,?\n")); - resdata.put(asn, new String("0,0,1,?,0\n")); - resdata.put(pro, new String("?,0,1,?,?\n")); - resdata.put(gln, new String("0,0,2,?,0\n")); - resdata.put(arg, new String("0,1,2,?,1\n")); - resdata.put(ser, new String("0,0,0.5,?,0\n")); - resdata.put(thr, new String("1,0,1,?,0,\n")); - resdata.put(val, new String("2,0,1,0,?\n")); - resdata.put(trp, new String("1,0,2,1,1\n")); - resdata.put(tyr, new String("1,0,2,1,0\n")); - resdata.put(sel, new String("?,?,?,?,\n")); - - - // Properties i have to check for while going through the AA-chain - Property iib = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "isImmediatelyBefore"); - Property ba = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "beginsAt"); - Property ea = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "endsAt"); - - Property type = ResourceFactory.createProperty("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "type"); - - // every element in riter stands for a AA-chain start - // every first amino acid indicates a new AA-chain - while (riter.hasNext()) - { - // Initialization of variables needed - int i = 0; - Resource aaOne = riter.nextResource(); - Resource currentaa = aaOne; - Resource nextaa = aaOne; - boolean inHelix = false; - System.out.println(currentaa.getURI()); - // look if there is a next AA - do { - ++i; - System.out.print(i + " "); - //looks weird, but is needed to enter loop even for the last AA which does not have a iib-Property - currentaa = nextaa; - NodeIterator resType = model.listObjectsOfProperty(currentaa,type); + private static void createArffFile(String pdbDir, PDBIdRdfModel model, ResIterator firstAAs){ + + try { + PDBProtein protein = model.getProtein(); + String arffFilePath = pdbDir + protein.getArffFileName(); + PrintStream out = new PrintStream (arffFilePath); + _logger.debug("Creating ARFF file: " + arffFilePath); + + /* + * RELATION + */ + String relation = "@RELATION " + protein.getPdbID(); + out.println(relation); + _logger.debug(relation); + + /* + * ATTRIBUTES + */ + // Integer declaring Position in chain + String attributes = "@ATTRIBUTE hydrophob NUMERIC\n" + // Hydrophilic = 0; Hydrophobic = 1; Very_hydrophobic = 2 + "@ATTRIBUTE charge NUMERIC\n" + // Negative = -1; Neutral = 0; Positive = 1 + "@ATTRIBUTE size NUMERIC\n" + // Large = 2; Small = 1; Tiny = 0.5 + "@ATTRIBUTE aromaticity NUMERIC\n" + // Aliphatic = 0; Aromatic = 1 + "@ATTRIBUTE hydrogen_bonding NUMERIC\n"; // Donor = 1; Donor/Acceptor = 0; Acceptor = -1 + + for (int i = -8; i < 8; i++) { + attributes += "@ATTRIBUTE aa_position_" + i + " CLASS\n"; // amino acid at position $i from current amino acid + } + attributes += "@ATTRIBUTE in_helix NUMERIC\n"; // Helix = 1 Other = 0 + + _logger.debug(attributes); + out.println(attributes); + + /* + * @DATA + */ + String data = "@DATA\n"; + _logger.debug(data); + out.println(data); + + // HashMap containing information about the properties of every amino acid + HashMap<Resource, String> resdata = AminoAcids.getAminoAcidArffAttributeMap(); + ArrayList<Resource> positives = model.getPositives(); + ArrayList<Resource> negatives = model.getNegatives(); + Property type = ResourceFactory.createProperty("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "type"); + Property iib = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "isImmediatelyBefore"); + + + while (firstAAs.hasNext()){ + Resource firstAA = firstAAs.next(); + Resource currentAA = firstAA; + Resource nextAA = firstAA; - // die Guten ins Töpfchen ... - // if we get an non-empty iterator for pdb:beginsAt the next AAs are within a AA-helix - if(model.listResourcesWithProperty(ba, currentaa).hasNext() && !inHelix ) - { - inHelix = true; - } - // die Schlechten ins Kröpfchen - // if we get an non-empty iterator for pdb:endsAt and are already within a AA-helix - // the AAs AFTER the current ones aren't within a helix - if (model.listResourcesWithProperty(ea, currentaa).hasNext() && inHelix) - { - inHelix = false; - } - // get next AA if there is one - if (model.listObjectsOfProperty(currentaa, iib).hasNext()) - { - nextaa = model.getProperty(currentaa, iib).getResource(); - } - - // add current amino acid to positives or negatives set - while(resType.hasNext()) - { - Resource aaType = resType.next().asResource(); - System.out.println(aaType.getURI()); - if (resdata.get(aaType) != null) - { - if (inHelix) - { - data += i + "," + 1 + "," + resdata.get(aaType); + int i = 0; + String dataLine; + do { + dataLine = ""; + currentAA = nextAA; + + NodeIterator niter = model.getModel().listObjectsOfProperty(currentAA, type); + while (niter.hasNext()){ + Resource key = niter.next().asResource(); + if (resdata.containsKey(key)){ + dataLine += resdata.get(key) +","; } - else - { - data += i + "," + 0 + "," + resdata.get(aaType); + } + + for (int j = (i - 8... [truncated message content] |
From: <km...@us...> - 2011-10-26 14:34:16
|
Revision: 3327 http://dl-learner.svn.sourceforge.net/dl-learner/?rev=3327&view=rev Author: kmpf Date: 2011-10-26 14:34:10 +0000 (Wed, 26 Oct 2011) Log Message: ----------- Modified Paths: -------------- trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBWekaLearner.java Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java 2011-10-26 13:48:52 UTC (rev 3326) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java 2011-10-26 14:34:10 UTC (rev 3327) @@ -415,8 +415,8 @@ "@ATTRIBUTE aromaticity NUMERIC\n" + // Aliphatic = 0; Aromatic = 1 "@ATTRIBUTE hydrogen_bonding NUMERIC\n"; // Donor = 1; Donor/Acceptor = 0; Acceptor = -1 - for (int i = -8; i < 8; i++) { - attributes += "@ATTRIBUTE aa_position_" + i + " CLASS\n"; // amino acid at position $i from current amino acid + for (int i = -8; i <= 8; i++) { + attributes += "@ATTRIBUTE aa_position_" + i + " {A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y}\n"; // amino acid at position $i from current amino acid } attributes += "@ATTRIBUTE in_helix NUMERIC\n"; // Helix = 1 Other = 0 Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBWekaLearner.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBWekaLearner.java 2011-10-26 13:48:52 UTC (rev 3326) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBWekaLearner.java 2011-10-26 14:34:10 UTC (rev 3327) @@ -3,7 +3,6 @@ import java.io.File; import java.io.FileWriter; import java.io.IOException; -import java.util.List; import java.util.Random; import org.apache.log4j.ConsoleAppender; @@ -11,18 +10,19 @@ import org.apache.log4j.Level; import org.apache.log4j.Logger; import org.apache.log4j.SimpleLayout; - +/* import weka.classifiers.Evaluation; -import weka.classifiers.trees.J48; +import weka.classifiers.bayes.NaiveBayes; import weka.core.Instances; import weka.core.converters.ConverterUtils.DataSource; +*/ public class PDBWekaLearner { private static Logger logger = Logger.getRootLogger(); public PDBWekaLearner (File arffFile) throws IOException{ - + /* // create logger (configure this to your needs) SimpleLayout layout = new SimpleLayout(); FileAppender fileAppender = new FileAppender(layout, "log/sample_log.txt", false); @@ -42,13 +42,13 @@ if (data.classIndex() == -1) data.setClassIndex(data.numAttributes() - 1); - String[] options = new String[1]; - options[0] = "-U"; // unpruned tree - J48 tree = new J48(); // new instance of tree - tree.setOptions(options); // set the options - //tree.buildClassifier(data); // build classifier + // String[] options = new String[0]; + + NaiveBayes classifier = new NaiveBayes(); // new instance of tree + // classifier.setOptions(options); // set the options + classifier.buildClassifier(data); // build classifier Evaluation eval = new Evaluation(data); - eval.crossValidateModel(tree, data, 10, new Random(1)); + eval.crossValidateModel(classifier, data, 10, new Random(1)); // gather the results of the evaluation process String resultsFileName = arffFile.getPath().replace(".arff", ".weka.res"); @@ -70,5 +70,6 @@ } catch (Exception e){ e.printStackTrace(); } + */ } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <km...@us...> - 2011-11-03 17:28:21
|
Revision: 3370 http://dl-learner.svn.sourceforge.net/dl-learner/?rev=3370&view=rev Author: kmpf Date: 2011-11-03 17:28:10 +0000 (Thu, 03 Nov 2011) Log Message: ----------- Changed *.conf format. Modified Paths: -------------- trunk/scripts/src/main/java/org/dllearner/examples/pdb/AminoAcids.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBIdRdfModel.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBWekaLearner.java Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/AminoAcids.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/AminoAcids.java 2011-11-03 15:02:37 UTC (rev 3369) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/AminoAcids.java 2011-11-03 17:28:10 UTC (rev 3370) @@ -90,6 +90,33 @@ return resprint; } + public static HashMap<Resource, StringBuffer> getAminoAcidStringBufferMap(String init){ + // put all amino acid resources and the their conf-files together + HashMap<Resource, StringBuffer> resourceString = new HashMap<Resource, StringBuffer>(30); + resourceString.put(ALA, new StringBuffer(init)); + resourceString.put(CYS, new StringBuffer(init)); + resourceString.put(ASP, new StringBuffer(init)); + resourceString.put(GLU, new StringBuffer(init)); + resourceString.put(PHE, new StringBuffer(init)); + resourceString.put(GLY, new StringBuffer(init)); + resourceString.put(HIS, new StringBuffer(init)); + resourceString.put(ILE, new StringBuffer(init)); + resourceString.put(LYS, new StringBuffer(init)); + resourceString.put(LEU, new StringBuffer(init)); + resourceString.put(MET, new StringBuffer(init)); + resourceString.put(ASN, new StringBuffer(init)); + resourceString.put(PRO, new StringBuffer(init)); + resourceString.put(GLN, new StringBuffer(init)); + resourceString.put(ARG, new StringBuffer(init)); + resourceString.put(SER, new StringBuffer(init)); + resourceString.put(THR, new StringBuffer(init)); + resourceString.put(VAL, new StringBuffer(init)); + resourceString.put(TRP, new StringBuffer(init)); + resourceString.put(TYR, new StringBuffer(init)); + resourceString.put(SEL, new StringBuffer(init)); + return resourceString; + } + public static HashMap<Resource, String> getAminoAcidArffAttributeMap(){ HashMap<Resource, String> resdata = new HashMap<Resource, String>(30); resdata.put(ALA, new String("2,0,0.5,?,?")); Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java 2011-11-03 15:02:37 UTC (rev 3369) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java 2011-11-03 17:28:10 UTC (rev 3370) @@ -101,16 +101,16 @@ * load = true -> load alle .rdf, .conf and .arff Files that can be found within the directory dataDir * load = false -> don't load anything */ - Boolean dlLearn = true; - Boolean wekaLearn = true; + Boolean dlLearn = false; + Boolean wekaLearn = false; int dataSet = 5; /* * data for test purpose */ -// PdbProtein testProtein = new PdbProtein("1XFF"); - PDBProtein testProtein = new PDBProtein("1XFF", "A"); + PDBProtein testProtein = new PDBProtein("1XFF"); +// PDBProtein testProtein = new PDBProtein("1LMB", "3"); /* * create a training data set @@ -233,14 +233,14 @@ try { String line = protein.getPdbID() + "." + protein.getChainID() + "." + protein.getSpecies() + "\n"; FileWriter out = new FileWriter(speciesProteins, true); - _logger.debug("Write " + line + "to file " + speciesProteins.getPath() + speciesProteins.getName()); + _logger.debug("Write " + line + "to file " + speciesProteins.getPath()); out.write(line); out.close(); } catch (FileNotFoundException e) { - _logger.error("Could not find file " + speciesProteins.getPath() + speciesProteins.getName()); + _logger.error("Could not find file " + speciesProteins.getPath() ); e.printStackTrace(); } catch (IOException e) { - _logger.error("Something went wrong while trying to write to " + speciesProteins.getPath() + speciesProteins.getName()); + _logger.error("Something went wrong while trying to write to " + speciesProteins.getPath() ); e.printStackTrace(); } } @@ -325,33 +325,53 @@ try { PDBProtein protein = model.getProtein(); - // the file with all amino acids + + // the .conf file that contains all positives and negatives String confFilePath = pdbDir + protein.getConfFileName(); PrintStream confFile = new PrintStream (new File(confFilePath)); - // add import statements to confFile - String importStmt = new String("import(\"../AA_properties.owl\");\n" + - "import(\"" + protein.getRdfFileName() + "\");\n"); - confFile.println(importStmt); - - HashMap<Resource, File> confFilePerResidue = AminoAcids.getAllConfFiles(pdbDir, protein.getConfFileName()); - HashMap<Resource, PrintStream> resprint = AminoAcids.getAminoAcidPrintStreamMap(confFilePerResidue); - Property type = ResourceFactory.createProperty("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "type"); + // knowledge source definition + String ks = new String ("// knowledge source definition\n" + + "ks.type = \"OWL File\"\n" + + "ks.fileName = \"AA_properties.owl\"\n" + + "\n" + + "ks.type = \"OWL File\"\n" + + "ks.fileName = \"" + protein.getRdfFileName() + "\"\n"); + // learning problem + StringBuffer lp = new StringBuffer ("// learning problem\n" + + "lp.type = \"posNegStandard\"" + + "\n" + + "lp.positiveExamples = { "); - // add import statements to <PDB ID>_<Amino Acid>.conf files - Iterator<Resource> keys = resprint.keySet().iterator(); - while (keys.hasNext()){ - resprint.get(keys.next()).println(importStmt); + // separate files that contain only positives and negatives of a distinct type + HashMap<Resource, File> confFilePerResidue = + AminoAcids.getAllConfFiles(pdbDir, protein.getConfFileName()); + HashMap<Resource, PrintStream> resprint = + AminoAcids.getAminoAcidPrintStreamMap(confFilePerResidue); + HashMap<Resource, StringBuffer> resourceStringBuffer = + AminoAcids.getAminoAcidStringBufferMap(lp.toString()); + + // add knowledge source definition to <PDB ID>.conf files + confFile.println(ks); + + // add knowledge source definition to <PDB ID>_<Amino Acid>.conf files + Iterator<Resource> resources = resprint.keySet().iterator(); + while (resources.hasNext()){ + resprint.get(resources.next()).println(ks); } - // add every amino acid in positive list to <PDB ID>.conf and its corresponding <PDB ID>_<Amino Acid>.conf + ArrayList<Resource> positives = model.getPositives(); + + Property type = ResourceFactory.createProperty("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "type"); + + // add positive examples to <PDB ID>.conf and corresponding <PDB ID>_<Amino Acid>.conf files for (int i = 0 ; i < positives.size() ; i++ ) { - confFile.println("+\"" + positives.get(i).getURI() + "\""); + lp.append("\"" + positives.get(i).getURI() + "\", "); try{ Statement spo = model.getModel().getProperty(positives.get(i), type); - resprint.get(spo.getResource()).println("+\"" + positives.get(i).getURI() + "\""); + resourceStringBuffer.get(spo.getResource()).append("\"" + positives.get(i).getURI() + "\", "); } catch (NullPointerException e) { // What was the Object that probably caused the pain? _logger.error("Object probably not in our HashMap: " + @@ -360,13 +380,29 @@ } } + if (lp.toString().contains(",")) + lp.deleteCharAt(lp.lastIndexOf(",")); + lp.append("}\n" + + "lp.negativeExamples = { "); + + resources = resourceStringBuffer.keySet().iterator(); + while (resources.hasNext()){ + Resource residue = resources.next(); + if (resourceStringBuffer.get(residue).toString().contains(",")) + resourceStringBuffer.get(residue).deleteCharAt( + resourceStringBuffer.get(residue).lastIndexOf(",")); + resourceStringBuffer.get(residue).append("}\n" + + "lp.negativeExamples = { "); + } + + // add every amino acid in negative list to <PDB ID>.conf and its corresponding <PDB ID>_<Amino Acid>.conf ArrayList<Resource> negatives = model.getNegatives(); for (int i = 0 ; i < negatives.size() ; i++ ) { - confFile.println("-\"" + negatives.get(i).getURI() + "\""); + lp.append("\"" + negatives.get(i).getURI() + "\", "); try{ Statement spo = model.getModel().getProperty(negatives.get(i), type); - resprint.get(spo.getResource()).println("-\"" + negatives.get(i).getURI() + "\""); + resourceStringBuffer.get(spo.getResource()).append("\"" + negatives.get(i).getURI() + "\", "); } catch (NullPointerException e) { // What was the Object that probably caused the pain? _logger.error("Object probably not in our HashMap: " + @@ -374,7 +410,26 @@ e.printStackTrace(); } } + + // add learning problem to <PDB ID>.conf file + // and write learning problem to file + if (lp.toString().contains(",")) + lp.deleteCharAt(lp.lastIndexOf(",")); + lp.append("}\n"); + confFile.println(lp); + // add learning problem to <PDB ID>_<Amino Acid>.conf files + // and write learning problem to file + resources = resourceStringBuffer.keySet().iterator(); + while (resources.hasNext()){ + Resource residue = resources.next(); + if (resourceStringBuffer.get(residue).toString().contains(",")) + resourceStringBuffer.get(residue).deleteCharAt( + resourceStringBuffer.get(residue).lastIndexOf(",")); + resourceStringBuffer.get(residue).append("}\n"); + resprint.get(residue).println(resourceStringBuffer.get(residue)); + } + // Important - free up resources used running the query confFile.close(); @@ -382,7 +437,6 @@ while ( newkeys.hasNext() ){ resprint.get(newkeys.next()).close(); } - } catch (IOException e) { @@ -409,16 +463,16 @@ * ATTRIBUTES */ // Integer declaring Position in chain - String attributes = "@ATTRIBUTE hydrophob NUMERIC\n" + // Hydrophilic = 0; Hydrophobic = 1; Very_hydrophobic = 2 + StringBuffer attributes = new StringBuffer("@ATTRIBUTE hydrophob NUMERIC\n" + // Hydrophilic = 0; Hydrophobic = 1; Very_hydrophobic = 2 "@ATTRIBUTE charge NUMERIC\n" + // Negative = -1; Neutral = 0; Positive = 1 "@ATTRIBUTE size NUMERIC\n" + // Large = 2; Small = 1; Tiny = 0.5 "@ATTRIBUTE aromaticity NUMERIC\n" + // Aliphatic = 0; Aromatic = 1 - "@ATTRIBUTE hydrogen_bonding NUMERIC\n"; // Donor = 1; Donor/Acceptor = 0; Acceptor = -1 + "@ATTRIBUTE hydrogen_bonding NUMERIC\n"); // Donor = 1; Donor/Acceptor = 0; Acceptor = -1 for (int i = -8; i <= 8; i++) { - attributes += "@ATTRIBUTE aa_position_" + i + " {A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y}\n"; // amino acid at position $i from current amino acid + attributes.append("@ATTRIBUTE aa_position_" + i + " {A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y}\n"); // amino acid at position $i from current amino acid } - attributes += "@ATTRIBUTE in_helix NUMERIC\n"; // Helix = 1 Other = 0 + attributes.append("@ATTRIBUTE in_helix NUMERIC\n"); // Helix = 1 Other = 0 _logger.debug(attributes); out.println(attributes); @@ -443,32 +497,32 @@ Resource currentAA = firstAA; Resource nextAA = firstAA; - int i = 0; - String dataLine; - do { - dataLine = ""; + + + for ( int i = 0; currentAA.hasProperty(iib); i++ ) { + StringBuffer dataLine = new StringBuffer(""); currentAA = nextAA; NodeIterator niter = model.getModel().listObjectsOfProperty(currentAA, type); while (niter.hasNext()){ Resource key = niter.next().asResource(); if (resdata.containsKey(key)){ - dataLine += resdata.get(key) +","; + dataLine.append( resdata.get(key) + "," ); } } for (int j = (i - 8); j <= (i + 8) ; j++){ try { - dataLine += protein.getSequence().charAt(j) + ","; + dataLine.append( protein.getSequence().charAt(j) + "," ); } catch (IndexOutOfBoundsException e) { - dataLine += "?,"; + dataLine.append( "?," ); } } if (positives.contains(currentAA)){ - dataLine += "1"; + dataLine.append( "1" ); } else if (negatives.contains(currentAA)){ - dataLine += "0"; + dataLine.append( "0" ); } @@ -479,8 +533,7 @@ } _logger.info(dataLine); out.println(dataLine); - i++; - } while (currentAA.hasProperty(iib)) ; + } } } catch (FileNotFoundException e){ e.printStackTrace(); Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBIdRdfModel.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBIdRdfModel.java 2011-11-03 15:02:37 UTC (rev 3369) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBIdRdfModel.java 2011-11-03 17:28:10 UTC (rev 3370) @@ -37,7 +37,9 @@ this._protein = protein; this._pdbIdModel = this.getPdbRdfModel(); this.getProtein().setSequence(extractSequence(_pdbIdModel)); + System.out.println("Sequence: " + this.getProtein().getSequence()); this.getProtein().setSpecies(extractSpecies(_pdbIdModel)); + System.out.println("Species: " + this.getProtein().getSpecies()); createPositivesAndNegatives(); } @@ -98,11 +100,12 @@ "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> " + "PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> " + "PREFIX fn: <http://www.w3.org/2005/xpath-functions#> " + - "CONSTRUCT {<http://bio2rdf.org/pdb:" + this.getProtein().getPdbID() + "/extraction/source/gene/organism> rdfs:label ?species. }" + - "WHERE { ?x1 <http://purl.org/dc/terms/isPartOf> ?x2 ." + - " ?x1 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?x3 ." + - " ?x1 <http://bio2rdf.org/pdb:isImmediatelyBefore> ?x4 ." + - " ?x5 rdfs:label ?species FILTER (str(?x5) = fn:concat(str(?x2), '/extraction/source/gene/organism')) . }"; + "CONSTRUCT { pdb:" + this.getProtein().getPdbID() + "/extraction/source/gene/organism rdfs:label ?species. }" + + "WHERE { ?x1 dcterms:isPartOf ?x2 ." + + " ?x1 rdf:type> ?x3 ." + + " ?x1 pdb:isImmediatelyBefore ?x4 ." + + " ?x5 rdfs:label ?species " + + " FILTER (str(?x5) = fn:concat(str(?x2), '/extraction/source/gene/organism')) . }"; // System.out.println(queryString); @@ -189,36 +192,37 @@ "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> " + "PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> " + "PREFIX fn: <http://www.w3.org/2005/xpath-functions#> " + - "CONSTRUCT { ?x1 <http://bio2rdf.org/pdb:beginsAt> ?x2 ." + - " ?x1 <http://bio2rdf.org/pdb:endsAt> ?x3 . " + - " ?x5 <http://purl.org/dc/terms/isPartOf> ?x4 . " + - " ?x5 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?x6 ." + - " ?x5 <http://bio2rdf.org/pdb:isImmediatelyBefore> ?x7 ." + + "CONSTRUCT { ?x1 pdb:beginsAt ?x2 ." + + " ?x1 pdb:endsAt ?x3 . " + + " ?x5 dcterms:isPartOf ?x4 . " + + " ?x5 rdf:type ?x6 ." + + " ?x5 pdb:isImmediatelyBefore ?x7 ." + " ?organism rdfs:label ?organismName ." + - " ?seq <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://bio2rdf.org/pdb:PolymerSequence> ." + + " ?seq rdf:type pdb:PolymerSequence ." + " ?seq pdb:hasValue ?sequence. } " + - "WHERE { ?x1 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://bio2rdf.org/pdb:Helix> ." + - " ?x1 <http://bio2rdf.org/pdb:beginsAt> ?x2 ." + - " ?x1 <http://bio2rdf.org/pdb:endsAt> ?x3 ." + - " ?x3 <http://purl.org/dc/terms/isPartOf> ?x4 ." + - " ?x4 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://bio2rdf.org/pdb:Polypeptide(L)> ." + - " ?x5 <http://purl.org/dc/terms/isPartOf> ?x4 ." + - " ?x5 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?x6 ."; + "WHERE { ?x1 rdf:type pdb:Helix ." + + " ?x1 pdb:beginsAt ?x2 ." + + " ?x1 pdb:endsAt ?x3 ." + + " ?x3 dcterms:isPartOf ?x4 ." + + " ?x4 rdf:type <http://bio2rdf.org/pdb:Polypeptide(L)> ." + + " ?x5 dcterms:isPartOf ?x4 ." + + " ?x5 rdf:type ?x6 ."; if (chainID.length() == 1 && pdbID.length() == 4) { queryString += - " ?x5 <http://bio2rdf.org/pdb:hasChainPosition> ?x8 ." + - " ?x8 <http://purl.org/dc/terms/isPartOf> <http://bio2rdf.org/pdb:" + + " ?x5 pdb:hasChainPosition ?x8 ." + + " ?x8 dcterms:isPartOf pdb:" + pdbID.toUpperCase() + "/chain_" + chainID.toUpperCase() + "> ."; } queryString += - " ?organism rdfs:label ?organismName FILTER (str(?organism) = fn:concat(str(?x4), '/extraction/source/gene/organism')) . " + - " ?seq <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://bio2rdf.org/pdb:PolymerSequence> . " + + " ?organism rdfs:label ?organismName " + + "FILTER (str(?organism) = fn:concat(str(?x4), '/extraction/source/gene/organism')) . " + + " ?seq rdf:type pdb:PolymerSequence . " + " ?seq pdb:hasValue ?sequence ." + // with the Optional clause i get the information by which amino acid // a amino acid is followed - " OPTIONAL { ?x5 <http://bio2rdf.org/pdb:isImmediatelyBefore> ?x7 . } .}"; + " OPTIONAL { ?x5 pdb:isImmediatelyBefore ?x7 . } .}"; System.out.println(queryString); Query query = QueryFactory.create(queryString); Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBWekaLearner.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBWekaLearner.java 2011-11-03 15:02:37 UTC (rev 3369) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBWekaLearner.java 2011-11-03 17:28:10 UTC (rev 3370) @@ -1,5 +1,5 @@ package org.dllearner.examples.pdb; - +/* import java.io.File; import java.io.FileWriter; import java.io.IOException; @@ -10,7 +10,7 @@ import org.apache.log4j.Level; import org.apache.log4j.Logger; import org.apache.log4j.SimpleLayout; -/* + import weka.classifiers.Evaluation; import weka.classifiers.bayes.NaiveBayes; import weka.core.Instances; @@ -22,7 +22,7 @@ private static Logger logger = Logger.getRootLogger(); public PDBWekaLearner (File arffFile) throws IOException{ - /* + /* // create logger (configure this to your needs) SimpleLayout layout = new SimpleLayout(); FileAppender fileAppender = new FileAppender(layout, "log/sample_log.txt", false); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <km...@us...> - 2011-11-11 15:05:35
|
Revision: 3396 http://dl-learner.svn.sourceforge.net/dl-learner/?rev=3396&view=rev Author: kmpf Date: 2011-11-11 15:05:29 +0000 (Fri, 11 Nov 2011) Log Message: ----------- Untested workaround to get the positive/negative class mapping correct. Modified Paths: -------------- trunk/scripts/src/main/java/org/dllearner/examples/pdb/AminoAcids.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBIdRdfModel.java Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/AminoAcids.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/AminoAcids.java 2011-11-11 09:22:08 UTC (rev 3395) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/AminoAcids.java 2011-11-11 15:05:29 UTC (rev 3396) @@ -117,29 +117,29 @@ return resourceString; } - public static HashMap<Resource, String> getAminoAcidArffAttributeMap(){ - HashMap<Resource, String> resdata = new HashMap<Resource, String>(30); - resdata.put(ALA, new String("2,0,0.5,?,?")); - resdata.put(CYS, new String("1,0,1,?,0")); - resdata.put(ASP, new String("0,-1,1,?,-1")); - resdata.put(GLU, new String("0,-1,2,?,-1")); - resdata.put(PHE, new String("2,0,2,1,?")); - resdata.put(GLY, new String("2,0,0.5,?,?")); - resdata.put(HIS, new String("1,1,2,1,1")); - resdata.put(ILE, new String("2,0,2,0,?")); - resdata.put(LYS, new String("1,1,2,?,1")); - resdata.put(LEU, new String("2,0,2,0,?")); - resdata.put(MET, new String("2,0,2,?,?")); - resdata.put(ASN, new String("0,0,1,?,0")); - resdata.put(PRO, new String("?,0,1,?,?")); - resdata.put(GLN, new String("0,0,2,?,0")); - resdata.put(ARG, new String("0,1,2,?,1")); - resdata.put(SER, new String("0,0,0.5,?,0")); - resdata.put(THR, new String("1,0,1,?,0,")); - resdata.put(VAL, new String("2,0,1,0,?")); - resdata.put(TRP, new String("1,0,2,1,1")); - resdata.put(TYR, new String("1,0,2,1,0")); - resdata.put(SEL, new String("?,?,?,?,?")); + public static HashMap<String, String> getAminoAcidArffAttributeMap(){ + HashMap<String, String> resdata = new HashMap<String, String>(30); + resdata.put(new String("A"), new String("2,0,0.5,?,?")); + resdata.put(new String("C"), new String("1,0,1,?,0")); + resdata.put(new String("D"), new String("0,-1,1,?,-1")); + resdata.put(new String("E"), new String("0,-1,2,?,-1")); + resdata.put(new String("F"), new String("2,0,2,1,?")); + resdata.put(new String("G"), new String("2,0,0.5,?,?")); + resdata.put(new String("H"), new String("1,1,2,1,1")); + resdata.put(new String("I"), new String("2,0,2,0,?")); + resdata.put(new String("K"), new String("1,1,2,?,1")); + resdata.put(new String("L"), new String("2,0,2,0,?")); + resdata.put(new String("M"), new String("2,0,2,?,?")); + resdata.put(new String("N"), new String("0,0,1,?,0")); + resdata.put(new String("P"), new String("?,0,1,?,?")); + resdata.put(new String("Q"), new String("0,0,2,?,0")); + resdata.put(new String("R"), new String("0,1,2,?,1")); + resdata.put(new String("S"), new String("0,0,0.5,?,0")); + resdata.put(new String("T"), new String("1,0,1,?,0,")); + resdata.put(new String("V"), new String("2,0,1,0,?")); + resdata.put(new String("W"), new String("1,0,2,1,1")); + resdata.put(new String("Y"), new String("1,0,2,1,0")); + resdata.put(new String("U"), new String("?,?,?,?,?")); return resdata; } Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java 2011-11-11 09:22:08 UTC (rev 3395) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java 2011-11-11 15:05:29 UTC (rev 3396) @@ -156,6 +156,7 @@ _logger.info("PDB ID: " + protein.getPdbID()); _logger.info("chain ID: " + protein.getChainID()); + trainmodel = new PDBIdRdfModel(protein); if (fasta){ @@ -491,13 +492,55 @@ out.println(data); // HashMap containing information about the properties of every amino acid - HashMap<Resource, String> resdata = AminoAcids.getAminoAcidArffAttributeMap(); + HashMap<String, String> resdata = AminoAcids.getAminoAcidArffAttributeMap(); ArrayList<Resource> positives = model.getPositives(); ArrayList<Resource> negatives = model.getNegatives(); Property type = ResourceFactory.createProperty("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "type"); Property iib = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "isImmediatelyBefore"); - + String sequence = protein.getSequence(); + HashMap<Integer, Resource> posRes = model.getPositionResource(); + for ( int i = 0; i < sequence.length(); i++) { + StringBuffer dataLine = new StringBuffer(""); + String key = Character.toString( sequence.charAt(i) ); + + // add amino acid description to dataLine + if ( resdata.containsKey(key) ){ + dataLine.append( resdata.get(key) + "," ); + } else { + // + dataLine.append( resdata.get("U") + "," ); + } + + // add information about neighbouring amino acids to dataLine + for (int j = (i - 8); j <= (i + 8) ; j++){ + try { + dataLine.append( protein.getSequence().charAt(j) + "," ); + } catch (IndexOutOfBoundsException e) { + dataLine.append( "?," ); + } + } + + // add information about positive or negative to dataLine + if (positives.contains( posRes.get( new Integer(i) ))){ + dataLine.append( "1" ); + } else if (negatives.contains( posRes.get( new Integer(i) ))){ + dataLine.append( "0" ); + } else { + dataLine.append( "?" ); + } + + _logger.info(dataLine); + out.println(dataLine); + + } + + } catch (FileNotFoundException e){ + e.printStackTrace(); + } + + +/* // to be exchanged while (firstAAs.hasNext()){ Resource firstAA = firstAAs.next(); Resource currentAA = firstAA; @@ -529,6 +572,8 @@ dataLine.append( "1" ); } else if (negatives.contains(currentAA)){ dataLine.append( "0" ); + } else { + dataLine.append( "?" ); } @@ -537,12 +582,6 @@ if (model.getModel().contains(currentAA, iib)){ nextAA = model.getModel().getProperty(currentAA, iib).getResource(); } - _logger.info(dataLine); - out.println(dataLine); - } - } - } catch (FileNotFoundException e){ - e.printStackTrace(); - } +*/ } } Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBIdRdfModel.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBIdRdfModel.java 2011-11-11 09:22:08 UTC (rev 3395) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBIdRdfModel.java 2011-11-11 15:05:29 UTC (rev 3396) @@ -5,7 +5,13 @@ import java.io.IOException; import java.io.PrintStream; import java.util.ArrayList; +import java.util.HashMap; +import org.apache.log4j.ConsoleAppender; +import org.apache.log4j.FileAppender; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import org.apache.log4j.SimpleLayout; import org.xml.sax.InputSource; import com.dumontierlab.pdb2rdf.model.PdbRdfModel; @@ -27,13 +33,17 @@ public class PDBIdRdfModel { + private static Logger _logger = Logger.getRootLogger(); + private PdbRdfModel _pdbIdModel = new PdbRdfModel(); private PdbRdfModel _removedFromModel = new PdbRdfModel(); private PDBProtein _protein = null ; private ArrayList<Resource> _positives = null; private ArrayList<Resource> _negatives = null; + private HashMap<Integer, Resource> _positionResource = null; - public PDBIdRdfModel (PDBProtein protein){ + public PDBIdRdfModel (PDBProtein protein) { + this._protein = protein; this._pdbIdModel = this.getPdbRdfModel(); this.getProtein().setSequence(extractSequence(_pdbIdModel)); @@ -41,6 +51,7 @@ this.getProtein().setSpecies(extractSpecies(_pdbIdModel)); System.out.println("Species: " + this.getProtein().getSpecies()); createPositivesAndNegatives(); + _positionResource = createPositionResidueMap(); } public PdbRdfModel getModel(){ @@ -58,6 +69,10 @@ public ArrayList<Resource> getNegatives(){ return _negatives; } + + public HashMap<Integer, Resource> getPositionResource(){ + return _positionResource; + } private PdbRdfModel getPdbRdfModel() { String[] pdbIDs = {_protein.getPdbID()}; @@ -192,13 +207,15 @@ "PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> " + "PREFIX fn: <http://www.w3.org/2005/xpath-functions#> " + "CONSTRUCT { ?x1 pdb:beginsAt ?x2 ." + - " ?x1 pdb:endsAt ?x3 . " + - " ?x5 dcterms:isPartOf ?x4 . " + + " ?x1 pdb:endsAt ?x3 ." + + " ?x5 dcterms:isPartOf ?x4 ." + " ?x5 rdf:type ?x6 ." + " ?x5 pdb:isImmediatelyBefore ?x7 ." + + " ?x5 pdb:hasChainPosition ?x8 ." + + " ?x8 rdfs:label ?residuePosition ." + " ?organism rdfs:label ?organismName ." + " ?seq rdf:type pdb:PolymerSequence ." + - " ?seq pdb:hasValue ?sequence. } " + + " ?seq pdb:hasValue ?sequence . } " + "WHERE { ?x1 rdf:type pdb:Helix ." + " ?x1 pdb:beginsAt ?x2 ." + " ?x1 pdb:endsAt ?x3 ." + @@ -206,7 +223,9 @@ " ?x4 rdf:type <http://bio2rdf.org/pdb:Polypeptide(L)> ." + " ?x5 dcterms:isPartOf ?x4 ." + " ?x5 rdf:type ?x6 ." + - " ?x5 pdb:hasChainPosition ?x8 . " + + " ?x5 pdb:hasChainPosition ?x8 ." + + " ?x8 dcterms:isPartOf ?x4 ." + + " ?x8 rdfs:label ?residuePosition ." + " OPTIONAL { ?x5 pdb:isImmediatelyBefore ?x7 . } . "; if (chainID.length() == 1 && pdbID.length() == 4) @@ -256,6 +275,59 @@ return niter; } + private HashMap<Integer, Resource> createPositionResidueMap(){ + + HashMap<Integer, Resource> posres = new HashMap<Integer, Resource>(150); + Property iib = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "isImmediatelyBefore"); + + ResIterator firstAAs = this.getFirstAA(); + while ( firstAAs.hasNext()){ + Resource firstAA = firstAAs.next(); + Resource nextAA = firstAA; + Resource currentAA = firstAA; + do { + currentAA = nextAA; + posres.put(new Integer(this.getResiduePosition(currentAA)), currentAA); + nextAA = _pdbIdModel.getProperty(currentAA, iib).getResource(); + } while (currentAA.hasProperty(iib)); + } + + return posres; + } + + private int getResiduePosition(Resource res) { + Property hasChainPosition = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "hasChainPosition"); + Property label = ResourceFactory.createProperty("http://www.w3.org/2000/01/rdf-schema#", "label"); + ResourceFactory.createResource(); + + NodeIterator residuePosition = _pdbIdModel.listObjectsOfProperty(res, hasChainPosition ); + ArrayList<RDFNode> positionNodes = new ArrayList<RDFNode>(); + ArrayList<String> positionLabels = new ArrayList<String>(); + while ( residuePosition.hasNext() ) { + RDFNode positionNode = residuePosition.next(); + positionNodes.add(positionNode); + NodeIterator positionLabelNodes = _pdbIdModel.listObjectsOfProperty( positionNode.asResource(), label ); + while ( positionLabelNodes.hasNext() ) { + positionLabels.add(positionLabelNodes.next().toString()); + } + + } + + + Integer position = null; + if ( positionNodes.size() == 1 && positionLabels.size() == 1 ) { + String positionLabel = positionLabels.get(0); + String a = new String( "Position " ); + String b = new String( " on chain" ); + position = Integer.parseInt( + positionLabel.substring(positionLabel.indexOf(a) + a.length(), positionLabel.indexOf(b))); + } else { + position = new Integer(0); + _logger.error(""); + } + return position.intValue(); + } + public void addDistanceInfo(){ String queryString = "PREFIX pdb: <http://bio2rdf.org/pdb:> " + This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <km...@us...> - 2011-11-23 18:49:18
|
Revision: 3433 http://dl-learner.svn.sourceforge.net/dl-learner/?rev=3433&view=rev Author: kmpf Date: 2011-11-23 18:49:11 +0000 (Wed, 23 Nov 2011) Log Message: ----------- Amino Acid Properties changed Modified Paths: -------------- trunk/scripts/src/main/java/org/dllearner/examples/pdb/AminoAcids.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBIdRdfModel.java Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/AminoAcids.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/AminoAcids.java 2011-11-23 14:24:16 UTC (rev 3432) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/AminoAcids.java 2011-11-23 18:49:11 UTC (rev 3433) @@ -117,32 +117,122 @@ return resourceString; } - public static HashMap<String, String> getAminoAcidArffAttributeMap(){ +/* + ++++ Amino acid names and numbers ++++ +Every line starts with the one-letter-code, followed by their numeric representation for .arff files, +followed by their three-letter-code and finally their name. + A = 1 Ala Alanin + C = 3 Cys Cystein + D = 4 Asp Aspartat + E = 5 Glu Glutamat + F = 6 Phe Phenylalanin + G = 7 Gly Glycin + H = 8 His Histidin + I = 9 Ile Isoleucin + K = 11 Lys Lysin + L = 12 Leu Leucin + M = 13 Met Methionin + N = 14 Asn Asparagin + O = 15 Pyl Pyrrolysin + P = 16 Pro Prolin + Q = 17 Gln Glutamin + R = 18 Arg Arginin + S = 19 Ser Serin + T = 20 Thr Threonin + U = 21 Sec Selenocystein + V = 22 Val Valin + W = 23 Trp Tryptophan + Y = 25 Tyr Tyrosin +*/ + public static HashMap<String, String> getAminoAcidNumber(){ + HashMap<String,String> resnum = new HashMap<String, String>(30); + resnum.put(new String("A"), new String("1")); + resnum.put(new String("C"), new String("3")); + resnum.put(new String("D"), new String("4")); + resnum.put(new String("E"), new String("5")); + resnum.put(new String("F"), new String("6")); + resnum.put(new String("G"), new String("7")); + resnum.put(new String("H"), new String("8")); + resnum.put(new String("I"), new String("9")); + resnum.put(new String("K"), new String("11")); + resnum.put(new String("L"), new String("12")); + resnum.put(new String("M"), new String("13")); + resnum.put(new String("N"), new String("14")); + resnum.put(new String("O"), new String("15")); + resnum.put(new String("P"), new String("16")); + resnum.put(new String("Q"), new String("17")); + resnum.put(new String("R"), new String("18")); + resnum.put(new String("S"), new String("19")); + resnum.put(new String("T"), new String("20")); + resnum.put(new String("U"), new String("21")); + resnum.put(new String("V"), new String("22")); + resnum.put(new String("W"), new String("23")); + resnum.put(new String("Y"), new String("25")); + + return resnum; + } + + public static HashMap<String, String> getAminoAcidNumericArffAttributeMap(){ + // Hydrophobicity hydrophilic = 0; Hydrophobic = 1; aromatic = 2; aliphatic = 3 + // Polarity unpolar = 0; polar = 1; positive = 2; negative = 3; + // Size Tiny = 0; Small = 1; Large = 2; HashMap<String, String> resdata = new HashMap<String, String>(30); - resdata.put(new String("A"), new String("2,0,0.5,?,?")); - resdata.put(new String("C"), new String("1,0,1,?,0")); - resdata.put(new String("D"), new String("0,-1,1,?,-1")); - resdata.put(new String("E"), new String("0,-1,2,?,-1")); - resdata.put(new String("F"), new String("2,0,2,1,?")); - resdata.put(new String("G"), new String("2,0,0.5,?,?")); - resdata.put(new String("H"), new String("1,1,2,1,1")); - resdata.put(new String("I"), new String("2,0,2,0,?")); - resdata.put(new String("K"), new String("1,1,2,?,1")); - resdata.put(new String("L"), new String("2,0,2,0,?")); - resdata.put(new String("M"), new String("2,0,2,?,?")); - resdata.put(new String("N"), new String("0,0,1,?,0")); - resdata.put(new String("P"), new String("?,0,1,?,?")); - resdata.put(new String("Q"), new String("0,0,2,?,0")); - resdata.put(new String("R"), new String("0,1,2,?,1")); - resdata.put(new String("S"), new String("0,0,0.5,?,0")); - resdata.put(new String("T"), new String("1,0,1,?,0,")); - resdata.put(new String("V"), new String("2,0,1,0,?")); - resdata.put(new String("W"), new String("1,0,2,1,1")); - resdata.put(new String("Y"), new String("1,0,2,1,0")); - resdata.put(new String("U"), new String("?,?,?,?,?")); + resdata.put(new String("A"), new String("1,0,0")); + resdata.put(new String("C"), new String("1,1,0")); + resdata.put(new String("D"), new String("0,3,1")); + resdata.put(new String("E"), new String("0,3,2")); + resdata.put(new String("F"), new String("2,0,2")); + resdata.put(new String("G"), new String("1,0,0")); + resdata.put(new String("H"), new String("2,2,2")); + resdata.put(new String("I"), new String("3,0,2")); + resdata.put(new String("K"), new String("1,2,2")); + resdata.put(new String("L"), new String("3,0,2")); + resdata.put(new String("M"), new String("1,0,2")); + resdata.put(new String("N"), new String("0,1,1")); + resdata.put(new String("O"), new String("?,?,?")); + resdata.put(new String("P"), new String("0,0,1")); + resdata.put(new String("Q"), new String("0,1,2")); + resdata.put(new String("R"), new String("0,2,2")); + resdata.put(new String("S"), new String("0,1,0")); + resdata.put(new String("T"), new String("1,1,1")); + resdata.put(new String("V"), new String("3,0,1")); + resdata.put(new String("W"), new String("2,1,2")); + resdata.put(new String("X"), new String("?,?,?")); // unknown residue (e.g. modified amino acids) + resdata.put(new String("Y"), new String("2,1,2")); + resdata.put(new String("U"), new String("?,?,?")); + return resdata; + } + + public static HashMap<String, String> getAminoAcidNominalArffAttributeMap(){ + // Hydrophobicity hydrophilic = 0; Hydrophobic = 1; aromatic = 2; aliphatic = 3 + // Polarity unpolar = 0 polar = 1; positive = 2; negative = 3; + // Size Tiny = 0; Small = 1; Large = 2; + HashMap<String, String> resdata = new HashMap<String, String>(30); + + resdata.put(new String("A"), new String("Hydrophobic,Unpolar,Tiny")); + resdata.put(new String("C"), new String("Hydrophobic,Polar,Tiny")); + resdata.put(new String("D"), new String("Hydrophilic,Negative,Small")); + resdata.put(new String("E"), new String("Hydrophilic,Negative,Large")); + resdata.put(new String("F"), new String("Aromatic,Unpolar,Large")); + resdata.put(new String("G"), new String("Hydrophobic,Unpolar,Tiny")); + resdata.put(new String("H"), new String("Aromatic,Positive,Large")); + resdata.put(new String("I"), new String("Aliphatic,Unpolar,Large")); + resdata.put(new String("K"), new String("Hydrophobic,Positive,Large")); + resdata.put(new String("L"), new String("Aliphatic,Unpolar,Large")); + resdata.put(new String("M"), new String("Hydrophobic,Unpolar,Large")); + resdata.put(new String("N"), new String("Hydrophilic,Polar,Small")); + resdata.put(new String("O"), new String("?,?,?")); + resdata.put(new String("P"), new String("Hydrophilic,Unpolar,Small")); + resdata.put(new String("Q"), new String("Hydrophilic,Polar,Large")); + resdata.put(new String("R"), new String("Hydrophilic,Positive,Large")); + resdata.put(new String("S"), new String("Hydrophilic,Polar,Tiny")); + resdata.put(new String("T"), new String("Hydrophobic,Polar,Small")); + resdata.put(new String("V"), new String("Aliphatic,Unpolar,Small")); + resdata.put(new String("W"), new String("Aromatic,Polar,Large")); + resdata.put(new String("X"), new String("?,?,?")); // unknown residue (e.g. modified amino acids) + resdata.put(new String("Y"), new String("Aromatic,Polar,Large")); + resdata.put(new String("U"), new String("?,?,?")); return resdata; } - - } Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java 2011-11-23 14:24:16 UTC (rev 3432) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java 2011-11-23 18:49:11 UTC (rev 3433) @@ -20,8 +20,6 @@ import org.dllearner.core.ComponentInitException; import org.dllearner.core.LearningProblemUnsupportedException; - -import com.hp.hpl.jena.rdf.model.NodeIterator; import com.hp.hpl.jena.rdf.model.Property; import com.hp.hpl.jena.rdf.model.ResIterator; import com.hp.hpl.jena.rdf.model.Resource; @@ -176,7 +174,8 @@ if (arff) { ResIterator niter = trainmodel.getFirstAA(); - createArffFile(pdbDir, trainmodel, niter); + createNumericArffFile(pdbDir, trainmodel, niter); + createNominalArffFile(pdbDir, trainmodel, niter); } /* @@ -361,7 +360,7 @@ // add knowledge source definition to <PDB ID>.conf files confFile.println(ks); - + // add knowledge source definition to <PDB ID>_<Amino Acid>.conf files Iterator<Resource> resources = resprint.keySet().iterator(); while (resources.hasNext()){ @@ -451,13 +450,14 @@ } } - private static void createArffFile(String pdbDir, PDBIdRdfModel model, ResIterator firstAAs){ + private static void createNumericArffFile(String pdbDir, PDBIdRdfModel model, ResIterator firstAAs){ try { PDBProtein protein = model.getProtein(); String arffFilePath = pdbDir + protein.getArffFileName(); + arffFilePath = arffFilePath.replace(".arff", ".numeric.arff"); PrintStream out = new PrintStream (arffFilePath); - _logger.debug("Creating ARFF file: " + arffFilePath); + _logger.debug("Creating numeric ARFF file: " + arffFilePath); /* * RELATION @@ -470,14 +470,16 @@ * ATTRIBUTES */ // Integer declaring Position in chain - StringBuffer attributes = new StringBuffer("@ATTRIBUTE hydrophob NUMERIC\n" + // Hydrophilic = 0; Hydrophobic = 1; Very_hydrophobic = 2 - "@ATTRIBUTE charge NUMERIC\n" + // Negative = -1; Neutral = 0; Positive = 1 - "@ATTRIBUTE size NUMERIC\n" + // Large = 2; Small = 1; Tiny = 0.5 - "@ATTRIBUTE aromaticity NUMERIC\n" + // Aliphatic = 0; Aromatic = 1 - "@ATTRIBUTE hydrogen_bonding NUMERIC\n"); // Donor = 1; Donor/Acceptor = 0; Acceptor = -1 + StringBuffer attributes = new StringBuffer( + // Hydrophobicity hydrophilic = 0; Hydrophobic = 1; aromatic = 2; aliphatic = 3 + "@ATTRIBUTE hydrophobicity NUMERIC\n" + + // Polarity unpolar = 0 polar = 1; positive = 2; negative = 3; + "@ATTRIBUTE polarity NUMERIC\n" + + // Size Tiny = 0; Small = 1; Large = 2; + "@ATTRIBUTE size NUMERIC\n"); for (int i = -8; i <= 8; i++) { - attributes.append("@ATTRIBUTE aa_position_" + i + " {A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y}\n"); // amino acid at position $i from current amino acid + attributes.append("@ATTRIBUTE aa_position_" + i + " NUMERIC\n"); // amino acid at position $i from current amino acid } attributes.append("@ATTRIBUTE in_helix NUMERIC\n"); // Helix = 1 Other = 0 @@ -492,14 +494,14 @@ out.println(data); // HashMap containing information about the properties of every amino acid - HashMap<String, String> resdata = AminoAcids.getAminoAcidArffAttributeMap(); + HashMap<String, String> resdata = AminoAcids.getAminoAcidNumericArffAttributeMap(); + HashMap<String, String> resnum = AminoAcids.getAminoAcidNumber(); ArrayList<Resource> positives = model.getPositives(); ArrayList<Resource> negatives = model.getNegatives(); - Property type = ResourceFactory.createProperty("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "type"); - Property iib = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "isImmediatelyBefore"); String sequence = protein.getSequence(); HashMap<Integer, Resource> posRes = model.getPositionResource(); + for ( int i = 0; i < sequence.length(); i++) { StringBuffer dataLine = new StringBuffer(""); String key = Character.toString( sequence.charAt(i) ); @@ -509,13 +511,14 @@ dataLine.append( resdata.get(key) + "," ); } else { // - dataLine.append( resdata.get("U") + "," ); + dataLine.append( resdata.get("X") + "," ); } // add information about neighbouring amino acids to dataLine for (int j = (i - 8); j <= (i + 8) ; j++){ try { - dataLine.append( protein.getSequence().charAt(j) + "," ); + dataLine.append( resnum.get( + Character.toString(protein.getSequence().charAt(j))) + "," ); } catch (IndexOutOfBoundsException e) { dataLine.append( "?," ); } @@ -538,50 +541,96 @@ } catch (FileNotFoundException e){ e.printStackTrace(); } + } + + private static void createNominalArffFile(String pdbDir, PDBIdRdfModel model, ResIterator firstAAs){ + + try { + PDBProtein protein = model.getProtein(); + String arffFilePath = pdbDir + protein.getArffFileName(); + arffFilePath = arffFilePath.replace(".arff", ".nominal.arff"); + PrintStream out = new PrintStream (arffFilePath); + _logger.debug("Creating nominal ARFF file: " + arffFilePath); + + /* + * RELATION + */ + String relation = "@RELATION " + protein.getPdbID(); + out.println(relation); + _logger.debug(relation); + /* + * ATTRIBUTES + */ + // Integer declaring Position in chain + StringBuffer attributes = new StringBuffer( + // Hydrophobicity hydrophilic = 0; hydrophobic = 1; aromatic = 2; aliphatic = 3 + "@ATTRIBUTE hydrophob {hydrophilic, hydrophobic, aromatic, aliphatic}\n" + // Hydrophilic = 0; Hydrophobic = 1; Very_hydrophobic = 2 + // Polarity unpolar = 0; polar = 1; positive = 2; negative = 3; + "@ATTRIBUTE charge {unpolar, polar, positive, negative}\n" + // Negative = -1; Neutral = 0; Positive = 1 + // Size tiny = 0; small = 1; large = 2; + "@ATTRIBUTE size {tiny, small, large}\n"); + + for (int i = -8; i <= 8; i++) { + attributes.append("@ATTRIBUTE aa_position_" + i + " {A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y}\n"); // amino acid at position $i from current amino acid + } + attributes.append("@ATTRIBUTE in_helix {Helix, Non_helix}\n"); // Helix = 1 Other = 0 + + _logger.debug(attributes); + out.println(attributes); -/* // to be exchanged - while (firstAAs.hasNext()){ - Resource firstAA = firstAAs.next(); - Resource currentAA = firstAA; - Resource nextAA = firstAA; + /* + * @DATA + */ + String data = "@DATA\n"; + _logger.debug(data); + out.println(data); + + // HashMap containing information about the properties of every amino acid + HashMap<String, String> resdata = AminoAcids.getAminoAcidNominalArffAttributeMap(); + ArrayList<Resource> positives = model.getPositives(); + ArrayList<Resource> negatives = model.getNegatives(); + String sequence = protein.getSequence(); + HashMap<Integer, Resource> posRes = model.getPositionResource(); + + for ( int i = 0; i < sequence.length(); i++) { + StringBuffer dataLine = new StringBuffer(""); + String key = Character.toString( sequence.charAt(i) ); + + // add amino acid description to dataLine + if ( resdata.containsKey(key) ){ + dataLine.append( resdata.get(key) + "," ); + } else { + // + dataLine.append( resdata.get("X") + "," ); + } + // add information about neighbouring amino acids to dataLine + for (int j = (i - 8); j <= (i + 8) ; j++){ + try { + dataLine.append( protein.getSequence().charAt(j) + "," ); + } catch (IndexOutOfBoundsException e) { + dataLine.append( "?," ); + } + } + // add information about positive or negative to dataLine + if (positives.contains( posRes.get( new Integer(i) ))){ + dataLine.append( "Helix" ); + } else if (negatives.contains( posRes.get( new Integer(i) ))){ + dataLine.append( "Non_helix" ); + } else { + dataLine.append( "?" ); + } - for ( int i = 0; currentAA.hasProperty(iib); i++ ) { - StringBuffer dataLine = new StringBuffer(""); - currentAA = nextAA; - - NodeIterator niter = model.getModel().listObjectsOfProperty(currentAA, type); - while (niter.hasNext()){ - Resource key = niter.next().asResource(); - if (resdata.containsKey(key)){ - dataLine.append( resdata.get(key) + "," ); - } - } - - for (int j = (i - 8); j <= (i + 8) ; j++){ - try { - dataLine.append( protein.getSequence().charAt(j) + "," ); - } catch (IndexOutOfBoundsException e) { - dataLine.append( "?," ); - } - } - - if (positives.contains(currentAA)){ - dataLine.append( "1" ); - } else if (negatives.contains(currentAA)){ - dataLine.append( "0" ); - } else { - dataLine.append( "?" ); - } + _logger.info(dataLine); + out.println(dataLine); + + } + + } catch (FileNotFoundException e){ + e.printStackTrace(); + } + } - - - // get next AA if there is one - if (model.getModel().contains(currentAA, iib)){ - nextAA = model.getModel().getProperty(currentAA, iib).getResource(); - } -*/ - } } Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBIdRdfModel.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBIdRdfModel.java 2011-11-23 14:24:16 UTC (rev 3432) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBIdRdfModel.java 2011-11-23 18:49:11 UTC (rev 3433) @@ -52,7 +52,7 @@ this.getProtein().setSpecies(extractSpecies(_pdbIdModel)); System.out.println("Species: " + this.getProtein().getSpecies()); createPositivesAndNegatives(); - _positionResource = createPositionResidueMap(); + this._positionResource = createPositionResidueMap(); } public PdbRdfModel getModel(){ @@ -160,15 +160,15 @@ Resource polymerSequence = ResourceFactory.createResource("http://bio2rdf.org/pdb:PolymerSequence"); ResIterator riter = model.listResourcesWithProperty(type, polymerSequence); - while (riter.hasNext()){ - Resource nextRes = riter.next(); + while (riter.hasNext()) { + Resource nextRes = riter.nextResource(); if (model.contains(nextRes, hasValue)){ NodeIterator niter = model.listObjectsOfProperty(nextRes, hasValue); sequence = niter.next().toString(); System.out.println("Sequence: " + sequence); } - } + } ; return sequence; } @@ -230,7 +230,6 @@ // a amino acid is followed " OPTIONAL { ?x5 pdb:isImmediatelyBefore ?x7 . } . " + " ?x5 pdb:hasChainPosition ?x8 ." + - " ?x8 rdfs:label ?residuePosition ." + " ?x8 pdb:hasValue ?x9 Filter (xsd:int(?x9)) ."; if (chainID.length() == 1 && pdbID.length() == 4) { @@ -283,7 +282,7 @@ Property iib = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "isImmediatelyBefore"); ResIterator firstAAs = this.getFirstAA(); - while ( firstAAs.hasNext()){ + while ( firstAAs.hasNext()) { Resource firstAA = firstAAs.next(); Resource currentAA = firstAA; posres.put(new Integer(this.getResiduePosition(currentAA)), currentAA); @@ -311,8 +310,7 @@ while ( positionLabelNodes.hasNext() ) { positionLabels.add(positionLabelNodes.next().asLiteral().getInt()); } - - } + } Integer position = null; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <km...@us...> - 2011-11-28 12:11:57
|
Revision: 3444 http://dl-learner.svn.sourceforge.net/dl-learner/?rev=3444&view=rev Author: kmpf Date: 2011-11-28 12:11:47 +0000 (Mon, 28 Nov 2011) Log Message: ----------- Logger configuration changed. Modified Paths: -------------- trunk/scripts/src/main/java/org/dllearner/examples/pdb/AminoAcids.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBIdRdfModel.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/ProteinDataSet.java Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/AminoAcids.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/AminoAcids.java 2011-11-28 11:22:01 UTC (rev 3443) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/AminoAcids.java 2011-11-28 12:11:47 UTC (rev 3444) @@ -84,6 +84,7 @@ resprint.put(TRP, new PrintStream(allConfFiles.get(TRP))); resprint.put(TYR, new PrintStream(allConfFiles.get(TYR))); resprint.put(SEL, new PrintStream(allConfFiles.get(SEL))); + resprint.put(HYT, new PrintStream(allConfFiles.get(HYT))); } catch (FileNotFoundException e) { e.printStackTrace(); } @@ -114,6 +115,7 @@ resourceString.put(TRP, new StringBuffer(init)); resourceString.put(TYR, new StringBuffer(init)); resourceString.put(SEL, new StringBuffer(init)); + resourceString.put(HYT, new StringBuffer(init)); return resourceString; } @@ -172,6 +174,14 @@ return resnum; } + /* + * +++ Amino acid properties +++ + * + * the following amino acid properties were gathered from + * http://www.russelllab.org/aas/ + * + */ + public static HashMap<String, String> getAminoAcidNumericArffAttributeMap(){ // Hydrophobicity hydrophilic = 0; Hydrophobic = 1; aromatic = 2; aliphatic = 3 // Polarity unpolar = 0; polar = 1; positive = 2; negative = 3; @@ -232,7 +242,6 @@ resdata.put(new String("X"), new String("?,?,?")); // unknown residue (e.g. modified amino acids) resdata.put(new String("Y"), new String("Aromatic,Polar,Large")); resdata.put(new String("U"), new String("?,?,?")); - return resdata; } } Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java 2011-11-28 11:22:01 UTC (rev 3443) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java 2011-11-28 12:11:47 UTC (rev 3444) @@ -53,7 +53,7 @@ // console, you have to set the threshold and log level to trace // (but we recommend just setting the log level to trace and observe // the log file) - consoleAppender.setThreshold(Level.DEBUG); + consoleAppender.setThreshold(Level.INFO); // logger 2 is writes to a file; it records all debug messages // (you can choose HTML or TXT) @@ -66,11 +66,12 @@ fileName = _dataDir + "log/log.html"; } else { // simple variant: layout2 = new SimpleLayout(); - layout2 = new PatternLayout("%r [%t] %-5p %c :\n%m%n\n"); + layout2 = new PatternLayout("%d [%t] %-5p %c : %m%n"); fileName = _dataDir + "log/log.txt"; } try { fileAppenderNormal = new FileAppender(layout2, fileName, false); + fileAppenderNormal.setThreshold(Level.INFO); } catch (IOException e) { e.printStackTrace(); } @@ -79,7 +80,7 @@ _rootLogger.removeAllAppenders(); _rootLogger.addAppender(consoleAppender); _rootLogger.addAppender(fileAppenderNormal); - _rootLogger.setLevel(Level.DEBUG); + _rootLogger.setLevel(Level.INFO); Boolean fasta = true; @@ -102,12 +103,12 @@ Boolean dlLearn = false; Boolean wekaLearn = false; - int dataSet = 5; + int dataSet = 1; /* * data for test purpose */ - PDBProtein testProtein = new PDBProtein("1XFF","A"); + PDBProtein testProtein = new PDBProtein("1EDM","B"); // PDBProtein testProtein = new PDBProtein("1LMB", "3"); // PDBProtein testProtein = new PDBProtein("8ABP"); @@ -146,6 +147,7 @@ if (rdfConf || arff) { PDBProtein protein = proteinSet.getProteinset().get(i); + _logger.info("Start with extracting data from: " + protein.getPdbID()); String pdbDir = _dataDir + protein.getPdbID() + "/"; File directory = new File(pdbDir); if(! directory.exists()) directory.mkdir(); @@ -153,7 +155,7 @@ //String arffFilePath = pdbDir + protein.getArffFileName(); _logger.info("PDB ID: " + protein.getPdbID()); - _logger.info("chain ID: " + protein.getChainID()); + _logger.info("Chain ID: " + protein.getChainID()); trainmodel = new PDBIdRdfModel(protein); @@ -189,6 +191,8 @@ trainmodel.removeStatementsWithObject(residue); Property isPartOf = ResourceFactory.createProperty("http://purl.org/dc/terms/", "isPartOf"); trainmodel.removeStatementsWithPoperty(isPartOf); + Property hasValue = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "hasValue"); + trainmodel.removeStatementsWithPoperty(hasValue); /* * we add the information which amino acid is the fourth predecessor of which other amino acid */ @@ -533,7 +537,7 @@ dataLine.append( "?" ); } - _logger.info(dataLine); + _logger.debug(dataLine); out.println(dataLine); } @@ -623,7 +627,7 @@ dataLine.append( "?" ); } - _logger.info(dataLine); + _logger.debug(dataLine); out.println(dataLine); } Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBIdRdfModel.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBIdRdfModel.java 2011-11-28 11:22:01 UTC (rev 3443) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBIdRdfModel.java 2011-11-28 12:11:47 UTC (rev 3444) @@ -34,7 +34,7 @@ public class PDBIdRdfModel { - private static Logger _logger = Logger.getRootLogger(); + private static Logger _logger = Logger.getLogger(HelixRDFCreator.class); private PdbRdfModel _pdbIdModel = new PdbRdfModel(); private PdbRdfModel _removedFromModel = new PdbRdfModel(); @@ -48,9 +48,9 @@ this._protein = protein; this._pdbIdModel = this.getPdbRdfModel(); this.getProtein().setSequence(extractSequence(_pdbIdModel)); - System.out.println("Sequence: " + this.getProtein().getSequence()); + _logger.info("Sequence: " + this.getProtein().getSequence()); this.getProtein().setSpecies(extractSpecies(_pdbIdModel)); - System.out.println("Species: " + this.getProtein().getSpecies()); + _logger.info("Species: " + this.getProtein().getSpecies()); createPositivesAndNegatives(); this._positionResource = createPositionResidueMap(); } @@ -122,7 +122,7 @@ " ?x1 pdb:isImmediatelyBefore ?x4 ." + " OPTIONAL { ?x5 rdfs:label ?species FILTER (str(?x5) = fn:concat(str(?x2), '/extraction/source/gene/organism')) . } . }"; - // System.out.println(queryString); + _logger.debug(queryString); PdbRdfModel construct = new PdbRdfModel(); Query query = QueryFactory.create(queryString); @@ -138,10 +138,7 @@ { RDFNode nextRes = niter.next(); species = nextRes.toString(); -/* QuerySolution soln = results.nextSolution() ; - Literal l = soln.getLiteral("species") ; // Get a result variable - must be a literal - species = l.getString();*/ - System.out.println(species); + _logger.debug(species); } } finally @@ -166,7 +163,7 @@ NodeIterator niter = model.listObjectsOfProperty(nextRes, hasValue); sequence = niter.next().toString(); - System.out.println("Sequence: " + sequence); + _logger.debug("Sequence: " + sequence); } } ; return sequence; @@ -219,9 +216,10 @@ " ?organism rdfs:label ?organismName ." + " ?seq rdf:type pdb:PolymerSequence ." + " ?seq pdb:hasValue ?sequence . } " + - "WHERE { ?x1 rdf:type pdb:Helix ." + + "WHERE { " + + " OPTIONAL { ?x1 rdf:type pdb:Helix ." + " ?x1 pdb:beginsAt ?x2 ." + - " ?x1 pdb:endsAt ?x3 ." + + " ?x1 pdb:endsAt ?x3 . } . " + " ?x3 dcterms:isPartOf ?x4 ." + " ?x4 rdf:type <http://bio2rdf.org/pdb:Polypeptide(L)> ." + " ?x5 dcterms:isPartOf ?x4 ." + @@ -245,7 +243,7 @@ " OPTIONAL { ?organism rdfs:label ?organismName " + "FILTER (str(?organism) = fn:concat(str(?x4), '/extraction/source/gene/organism')) . } . }"; - System.out.println(queryString); + _logger.debug(queryString); Query query = QueryFactory.create(queryString); QueryExecution qe = QueryExecutionFactory.create(query, model); construct.add(qe.execConstruct()); @@ -318,7 +316,6 @@ position = positionLabels.get(0); } else { position = new Integer(0); - _logger.error(""); } return position.intValue(); } @@ -347,7 +344,7 @@ "PREFIX x:<" + prop.getNameSpace() + "> " + "CONSTRUCT { ?x1 x:" + prop.getLocalName()+ " ?x2 . } " + "WHERE { ?x1 x:" + prop.getLocalName() + " ?x2 . }"; - //System.out.println(queryString); + _logger.debug(queryString); Query query = QueryFactory.create(queryString); QueryExecution qe = QueryExecutionFactory.create(query, _pdbIdModel); StmtIterator stmtiter = qe.execConstruct().listStatements(); @@ -365,7 +362,7 @@ "PREFIX x:<" + res.getNameSpace() + "> " + "CONSTRUCT { ?x1 ?x2 x:" + res.getLocalName() + " . } " + "WHERE { ?x1 ?x2 x:" + res.getLocalName() + " . }"; - // System.out.println(queryString); + _logger.debug(queryString); Query query = QueryFactory.create(queryString); QueryExecution qe = QueryExecutionFactory.create(query, _pdbIdModel); StmtIterator stmtiter = qe.execConstruct().listStatements(); @@ -392,7 +389,7 @@ while (riter.hasNext()) { // Initialization of variables needed Resource firstAA = riter.nextResource(); - System.out.println("First AA: " + firstAA.getLocalName()); + _logger.debug("First AA: " + firstAA.getLocalName()); Resource currentAA = firstAA; Resource nextAA = firstAA; boolean inHelix = false; @@ -427,15 +424,15 @@ } while (currentAA.hasProperty(iib)) ; } _positives = pos; - System.out.println("+++ Positive set +++"); + _logger.debug("+++ Positive set +++"); for (int i = 0; i < pos.size(); i++){ - System.out.println("Das " + i + "te Element: " + pos.get(i).getLocalName()); + _logger.debug("Das " + i + "te Element: " + pos.get(i).getLocalName()); } _negatives = neg; - System.out.println("+++ Negatvie set +++"); + _logger.debug("+++ Negatvie set +++"); for (int i = 0; i < neg.size(); i++){ - System.out.println("Das " + i + "te Element: " + neg.get(i).getLocalName()); + _logger.debug("Das " + i + "te Element: " + neg.get(i).getLocalName()); } } @@ -473,84 +470,4 @@ this.getProtein().setFastaFileName(fastaFileName); this.createFastaFile(dir); } - - - /* - * OLD STUFF - * - // every element in riter stands for a AA-chain start - // every first amino acid indicates a new AA-chain - while (riter.hasNext()) - { - // Initialization of variables needed - int i = 0; - Resource aaOne = riter.nextResource(); - Resource currentaa = aaOne; - Resource nextaa = aaOne; - boolean inHelix = false; - _logger.debug(currentaa.getURI()); - // look if there is a next AA - do { - ++i; - _logger.debug(i); - //looks weird, but is needed to enter loop even for the last AA which does not have a iib-Property - currentaa = nextaa; - NodeIterator resType = model.listObjectsOfProperty(currentaa,type); - - // die Guten ins Töpfchen ... - // if we get an non-empty iterator for pdb:beginsAt the next AAs are within a AA-helix - if(model.listResourcesWithProperty(ba, currentaa).hasNext() && !inHelix ) - { - inHelix = true; - } - // die Schlechten ins Kröpfchen - // if we get an non-empty iterator for pdb:endsAt and are already within a AA-helix - // the AAs AFTER the current ones aren't within a helix - if (model.listResourcesWithProperty(ea, currentaa).hasNext() && inHelix) - { - inHelix = false; - } - // get next AA if there is one - if (model.listObjectsOfProperty(currentaa, iib).hasNext()) - { - nextaa = model.getProperty(currentaa, iib).getResource(); - } - - // add current amino acid to positives or negatives set - while(resType.hasNext()) - { - Resource aaType = resType.next().asResource(); - _logger.info(aaType.getURI()); - if (resdata.get(aaType) != null) - { - if (inHelix) - { - data += i + "," + 1 + "," + resdata.get(aaType); - } - else - { - data += i + "," + 0 + "," + resdata.get(aaType); - } - } - } - - } while (currentaa.hasProperty(iib)) ; - } - - try - { - PrintStream out = new PrintStream (new File(arffFilePath)); - out.println(relation); - out.print(attribute); - out.print(data); - out.close(); - } - catch (FileNotFoundException e ) - { - System.err.println("Datei " + arffFilePath + " konnte nicht angelegt werden!"); - e.printStackTrace(); - } - - - */ } Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/ProteinDataSet.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/ProteinDataSet.java 2011-11-28 11:22:01 UTC (rev 3443) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/ProteinDataSet.java 2011-11-28 12:11:47 UTC (rev 3444) @@ -11,8 +11,12 @@ import java.util.HashMap; import java.util.Random; +import org.apache.log4j.Logger; + public class ProteinDataSet { + private static Logger _logger = Logger.getLogger(HelixRDFCreator.class); + private static String _dataDir = "../test/pdb/"; @@ -71,11 +75,11 @@ pdbproteins.close(); // get number of lines int linenr = lines.size(); - System.out.println("File "+ pdbIDlist.getCanonicalPath() + " has " + linenr + " lines."); + _logger.info("File "+ pdbIDlist.getCanonicalPath() + " has " + linenr + " lines."); this._proteinSet = new ArrayList<PDBProtein>(linenr); for (int i = 0; i < linenr; i++) { - System.out.println("LINES element " + i + " contains " + lines.get(i)); + _logger.info("LINES element " + i + " contains " + lines.get(i)); this._proteinSet.add( new PDBProtein( this.getPdbID(i, lines), @@ -85,7 +89,7 @@ } catch (IOException e) { - System.err.println("File " + pdbIDlist.getAbsolutePath() + " could not be read in!"); + _logger.error("File " + pdbIDlist.getAbsolutePath() + " could not be read in!"); // TODO Auto-generated catch block e.printStackTrace(); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <km...@us...> - 2011-12-21 14:38:51
|
Revision: 3513 http://dl-learner.svn.sourceforge.net/dl-learner/?rev=3513&view=rev Author: kmpf Date: 2011-12-21 14:38:40 +0000 (Wed, 21 Dec 2011) Log Message: ----------- Changed a SPARQL query Modified Paths: -------------- trunk/scripts/src/main/java/org/dllearner/examples/pdb/AminoAcids.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBIdRdfModel.java Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/AminoAcids.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/AminoAcids.java 2011-12-21 10:15:12 UTC (rev 3512) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/AminoAcids.java 2011-12-21 14:38:40 UTC (rev 3513) @@ -31,6 +31,7 @@ public static final Resource TYR = ResourceFactory.createResource("http://bio2rdf.org/pdb:Tyrosine"); public static final Resource SEL = ResourceFactory.createResource("http://bio2rdf.org/pdb:Selenomethionine"); public static final Resource HYT = ResourceFactory.createResource("http://bio2rdf.org/pdb:2-hydroxy-tryptophan"); + public static final Resource SOC = ResourceFactory.createResource("http://bio2rdf.org/pdb:S-oxyCysteine"); public static HashMap<Resource, File> getAllConfFiles (String dir, String confFileName){ HashMap<Resource, File> aminoAcidsConfFiles = new HashMap<Resource, File>(30); @@ -56,6 +57,8 @@ aminoAcidsConfFiles.put(TYR, new File(dir + confFileName.replace(".conf", "." + TYR.getLocalName() + ".conf"))); aminoAcidsConfFiles.put(SEL, new File(dir + confFileName.replace(".conf", "." + SEL.getLocalName() + ".conf"))); aminoAcidsConfFiles.put(HYT, new File(dir + confFileName.replace(".conf", "." + HYT.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(SOC, new File(dir + confFileName.replace(".conf", "." + SOC.getLocalName() + ".conf"))); + return aminoAcidsConfFiles; } @@ -85,6 +88,8 @@ resprint.put(TYR, new PrintStream(allConfFiles.get(TYR))); resprint.put(SEL, new PrintStream(allConfFiles.get(SEL))); resprint.put(HYT, new PrintStream(allConfFiles.get(HYT))); + resprint.put(SOC, new PrintStream(allConfFiles.get(SOC))); + } catch (FileNotFoundException e) { e.printStackTrace(); } @@ -116,6 +121,7 @@ resourceString.put(TYR, new StringBuffer(init)); resourceString.put(SEL, new StringBuffer(init)); resourceString.put(HYT, new StringBuffer(init)); + resourceString.put(SOC, new StringBuffer(init)); return resourceString; } Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java 2011-12-21 10:15:12 UTC (rev 3512) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java 2011-12-21 14:38:40 UTC (rev 3513) @@ -103,15 +103,19 @@ Boolean dlLearn = false; Boolean wekaLearn = false; - int dataSet = 1; + int dataSet = 5; /* * data for test purpose */ - PDBProtein testProtein = new PDBProtein("1EDM","B"); -// PDBProtein testProtein = new PDBProtein("1LMB", "3"); -// PDBProtein testProtein = new PDBProtein("8ABP"); + /* +++ Problem IDs +++ + * Warum funktionieren die Abfragen mit den untenstehenden PDB IDs nicht??? + */ +// PDBProtein testProtein = new PDBProtein("1HTR","P"); +// PDBProtein testProtein = new PDBProtein("2W9Y","A"); + PDBProtein testProtein = new PDBProtein("3A4R","A"); + /* * create a training data set @@ -411,6 +415,7 @@ for (int i = 0 ; i < negatives.size() ; i++ ) { lp.append("\"" + negatives.get(i).getURI() + "\", "); try{ + _logger.info("Negative residue: " + negatives.get(i).getURI()); Statement spo = model.getModel().getProperty(negatives.get(i), type); resourceStringBuffer.get(spo.getResource()).append("\"" + negatives.get(i).getURI() + "\", "); } catch (NullPointerException e) { Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBIdRdfModel.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBIdRdfModel.java 2011-12-21 10:15:12 UTC (rev 3512) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBIdRdfModel.java 2011-12-21 14:38:40 UTC (rev 3513) @@ -7,11 +7,7 @@ import java.util.ArrayList; import java.util.HashMap; -import org.apache.log4j.ConsoleAppender; -import org.apache.log4j.FileAppender; -import org.apache.log4j.Level; import org.apache.log4j.Logger; -import org.apache.log4j.SimpleLayout; import org.xml.sax.InputSource; import com.dumontierlab.pdb2rdf.model.PdbRdfModel; @@ -22,7 +18,6 @@ import com.hp.hpl.jena.query.QueryExecution; import com.hp.hpl.jena.query.QueryExecutionFactory; import com.hp.hpl.jena.query.QueryFactory; -import com.hp.hpl.jena.rdf.model.Literal; import com.hp.hpl.jena.rdf.model.NodeIterator; import com.hp.hpl.jena.rdf.model.Property; import com.hp.hpl.jena.rdf.model.RDFNode; @@ -205,30 +200,36 @@ "PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> " + "PREFIX fn: <http://www.w3.org/2005/xpath-functions#> " + "PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> " + - "CONSTRUCT { ?x1 pdb:beginsAt ?x2 ." + - " ?x1 pdb:endsAt ?x3 ." + - " ?x5 dcterms:isPartOf ?x4 ." + - " ?x5 rdf:type ?x6 ." + - " ?x5 pdb:isImmediatelyBefore ?x7 ." + - " ?x5 pdb:hasChainPosition ?x8 ." + - " ?x8 rdfs:label ?residuePosition ." + - " ?x8 pdb:hasValue ?x9 ." + - " ?organism rdfs:label ?organismName ." + - " ?seq rdf:type pdb:PolymerSequence ." + - " ?seq pdb:hasValue ?sequence . } " + + "CONSTRUCT { " + + " ?x1 pdb:beginsAt ?x2 ." + + " ?x1 pdb:endsAt ?x3 ." + + " ?x5 dcterms:isPartOf ?x4 ." + + " ?x5 rdf:type ?x6 ." + + " ?x5 pdb:isImmediatelyBefore ?x7 ." + + " ?x5 pdb:hasChainPosition ?x8 ." + + " ?x8 pdb:hasValue ?x9 ." + + " ?organism rdfs:label ?organismName ." + + " ?seq rdf:type pdb:PolymerSequence ." + + " ?seq pdb:hasValue ?sequence . " + + "} " + "WHERE { " + - " OPTIONAL { ?x1 rdf:type pdb:Helix ." + - " ?x1 pdb:beginsAt ?x2 ." + - " ?x1 pdb:endsAt ?x3 . } . " + - " ?x3 dcterms:isPartOf ?x4 ." + - " ?x4 rdf:type <http://bio2rdf.org/pdb:Polypeptide(L)> ." + - " ?x5 dcterms:isPartOf ?x4 ." + - " ?x5 rdf:type ?x6 ." + + " OPTIONAL { ?x1 rdf:type pdb:Helix ." + + " ?x1 pdb:beginsAt ?x2 ." + + " ?x1 pdb:endsAt ?x3 . " + + "} . " + + " ?x3 dcterms:isPartOf ?x4 ." + + " ?x4 rdf:type <http://bio2rdf.org/pdb:Polypeptide(L)> ." + + //" <http://bio2rdf.org/pdb:3A4R/chemicalComponent_A0> dcterms:isPartOf ?x4 ." + + //" <http://bio2rdf.org/pdb:3A4R/chemicalComponent_A0> rdf:type ?x6 ." + + //" OPTIONAL { <http://bio2rdf.org/pdb:3A4R/chemicalComponent_A0> pdb:isImmediatelyBefore ?x7 . } ." + + //" <http://bio2rdf.org/pdb:3A4R/chemicalComponent_A0> pdb:hasChainPosition ?x8 ." + + " ?x5 dcterms:isPartOf ?x4 . " + + " ?x5 rdf:type ?x6 ." + // with the optional clause i get the information by which amino acid // a amino acid is followed - " OPTIONAL { ?x5 pdb:isImmediatelyBefore ?x7 . } . " + - " ?x5 pdb:hasChainPosition ?x8 ." + - " ?x8 pdb:hasValue ?x9 Filter (xsd:int(?x9)) ."; + " OPTIONAL { ?x5 pdb:isImmediatelyBefore ?x7 . } ." + + " ?x5 pdb:hasChainPosition ?x8 ." + + " ?x8 pdb:hasValue ?x9 Filter (datatype((?x9)) = xsd:integer) ."; if (chainID.length() == 1 && pdbID.length() == 4) { queryString += @@ -237,11 +238,13 @@ "/chain_" + chainID.toUpperCase() + "> ."; } queryString += - " ?x4 pdb:hasPolymerSequence ?seq . " + - " ?seq rdf:type pdb:PolymerSequence . " + - " ?seq pdb:hasValue ?sequence . " + - " OPTIONAL { ?organism rdfs:label ?organismName " + - "FILTER (str(?organism) = fn:concat(str(?x4), '/extraction/source/gene/organism')) . } . }"; + " ?x4 pdb:hasPolymerSequence ?seq . " + + " ?seq rdf:type pdb:PolymerSequence . " + + " ?seq pdb:hasValue ?sequence . " + + " OPTIONAL { ?organism rdfs:label ?organismName " + + "FILTER (str(?organism) = fn:concat(str(?x4), '/extraction/source/gene/organism')) . " + + "} . " + + "}"; _logger.debug(queryString); Query query = QueryFactory.create(queryString); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |