From: <km...@us...> - 2011-10-26 07:52:41
|
Revision: 3320 http://dl-learner.svn.sourceforge.net/dl-learner/?rev=3320&view=rev Author: kmpf Date: 2011-10-26 07:52:30 +0000 (Wed, 26 Oct 2011) Log Message: ----------- Added a few classes for hopefully nicer code. Especially the two classes for DL-Learner and Weka code. Even though the Weka thing does not work at the moment. Modified Paths: -------------- trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java Added Paths: ----------- trunk/scripts/src/main/java/org/dllearner/examples/pdb/AminoAcids.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/ConfFileFilter.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBDLLearner.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBIdRdfModel.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBProtein.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBWekaLearner.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/ProteinDataSet.java Removed Paths: ------------- trunk/scripts/src/main/java/org/dllearner/examples/pdb/PdbProtein.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/RdfFileFilter.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/TrainAndTestSet.java Added: trunk/scripts/src/main/java/org/dllearner/examples/pdb/AminoAcids.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/AminoAcids.java (rev 0) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/AminoAcids.java 2011-10-26 07:52:30 UTC (rev 3320) @@ -0,0 +1,121 @@ +package org.dllearner.examples.pdb; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.PrintStream; +import java.util.HashMap; + +import com.hp.hpl.jena.rdf.model.Resource; +import com.hp.hpl.jena.rdf.model.ResourceFactory; + +public final class AminoAcids { + public static final Resource ALA = ResourceFactory.createResource("http://bio2rdf.org/pdb:Alanine"); + public static final Resource CYS = ResourceFactory.createResource("http://bio2rdf.org/pdb:Cysteine"); + public static final Resource ASP = ResourceFactory.createResource("http://bio2rdf.org/pdb:AsparticAcid"); + public static final Resource GLU = ResourceFactory.createResource("http://bio2rdf.org/pdb:GlutamicAcid"); + public static final Resource PHE = ResourceFactory.createResource("http://bio2rdf.org/pdb:Phenylalanine"); + public static final Resource GLY = ResourceFactory.createResource("http://bio2rdf.org/pdb:Glycine"); + public static final Resource HIS = ResourceFactory.createResource("http://bio2rdf.org/pdb:Histidine"); + public static final Resource ILE = ResourceFactory.createResource("http://bio2rdf.org/pdb:Isoleucine"); + public static final Resource LYS = ResourceFactory.createResource("http://bio2rdf.org/pdb:Lysine"); + public static final Resource LEU = ResourceFactory.createResource("http://bio2rdf.org/pdb:Leucine"); + public static final Resource MET = ResourceFactory.createResource("http://bio2rdf.org/pdb:Methionine"); + public static final Resource ASN = ResourceFactory.createResource("http://bio2rdf.org/pdb:Asparagine"); + public static final Resource PRO = ResourceFactory.createResource("http://bio2rdf.org/pdb:Proline"); + public static final Resource GLN = ResourceFactory.createResource("http://bio2rdf.org/pdb:Glutamine"); + public static final Resource ARG = ResourceFactory.createResource("http://bio2rdf.org/pdb:Arginine"); + public static final Resource SER = ResourceFactory.createResource("http://bio2rdf.org/pdb:Serine"); + public static final Resource THR = ResourceFactory.createResource("http://bio2rdf.org/pdb:Threonine"); + public static final Resource VAL = ResourceFactory.createResource("http://bio2rdf.org/pdb:Valine"); + public static final Resource TRP = ResourceFactory.createResource("http://bio2rdf.org/pdb:Tryptophan"); + public static final Resource TYR = ResourceFactory.createResource("http://bio2rdf.org/pdb:Tyrosine"); + public static final Resource SEL = ResourceFactory.createResource("http://bio2rdf.org/pdb:Selenomethionine"); + public static final Resource HYT = ResourceFactory.createResource("http://bio2rdf.org/pdb:2-hydroxy-tryptophan"); + + public static HashMap<Resource, File> getAllConfFiles (String dir, String confFileName){ + HashMap<Resource, File> aminoAcidsConfFiles = new HashMap<Resource, File>(30); + aminoAcidsConfFiles.put(ALA, new File(dir + confFileName.replace(".conf", "." + ALA.getLocalName()) + ".conf")); + aminoAcidsConfFiles.put(CYS, new File(dir + confFileName.replace(".conf", "." + CYS.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(ASP, new File(dir + confFileName.replace(".conf", "." + ASP.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(GLU, new File(dir + confFileName.replace(".conf", "." + GLU.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(PHE, new File(dir + confFileName.replace(".conf", "." + PHE.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(GLY, new File(dir + confFileName.replace(".conf", "." + GLY.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(HIS, new File(dir + confFileName.replace(".conf", "." + HIS.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(ILE, new File(dir + confFileName.replace(".conf", "." + ILE.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(LYS, new File(dir + confFileName.replace(".conf", "." + LYS.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(LEU, new File(dir + confFileName.replace(".conf", "." + LEU.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(MET, new File(dir + confFileName.replace(".conf", "." + MET.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(ASN, new File(dir + confFileName.replace(".conf", "." + ASN.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(PRO, new File(dir + confFileName.replace(".conf", "." + PRO.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(GLN, new File(dir + confFileName.replace(".conf", "." + GLN.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(ARG, new File(dir + confFileName.replace(".conf", "." + ARG.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(SER, new File(dir + confFileName.replace(".conf", "." + SER.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(THR, new File(dir + confFileName.replace(".conf", "." + THR.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(VAL, new File(dir + confFileName.replace(".conf", "." + VAL.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(TRP, new File(dir + confFileName.replace(".conf", "." + TRP.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(TYR, new File(dir + confFileName.replace(".conf", "." + TYR.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(SEL, new File(dir + confFileName.replace(".conf", "." + SEL.getLocalName() + ".conf"))); + aminoAcidsConfFiles.put(HYT, new File(dir + confFileName.replace(".conf", "." + HYT.getLocalName() + ".conf"))); + return aminoAcidsConfFiles; + } + + public static HashMap<Resource, PrintStream> getAminoAcidPrintStreamMap(HashMap<Resource, File> allConfFiles){ + // put all amino acid resources and the their conf-files together + HashMap<Resource, PrintStream> resprint = new HashMap<Resource, PrintStream>(30); + try{ + resprint.put(ALA, new PrintStream(allConfFiles.get(ALA))); + resprint.put(CYS, new PrintStream(allConfFiles.get(CYS))); + resprint.put(ASP, new PrintStream(allConfFiles.get(ASP))); + resprint.put(GLU, new PrintStream(allConfFiles.get(GLU))); + resprint.put(PHE, new PrintStream(allConfFiles.get(PHE))); + resprint.put(GLY, new PrintStream(allConfFiles.get(GLY))); + resprint.put(HIS, new PrintStream(allConfFiles.get(HIS))); + resprint.put(ILE, new PrintStream(allConfFiles.get(ILE))); + resprint.put(LYS, new PrintStream(allConfFiles.get(LYS))); + resprint.put(LEU, new PrintStream(allConfFiles.get(LEU))); + resprint.put(MET, new PrintStream(allConfFiles.get(MET))); + resprint.put(ASN, new PrintStream(allConfFiles.get(ASN))); + resprint.put(PRO, new PrintStream(allConfFiles.get(PRO))); + resprint.put(GLN, new PrintStream(allConfFiles.get(GLN))); + resprint.put(ARG, new PrintStream(allConfFiles.get(ARG))); + resprint.put(SER, new PrintStream(allConfFiles.get(SER))); + resprint.put(THR, new PrintStream(allConfFiles.get(THR))); + resprint.put(VAL, new PrintStream(allConfFiles.get(VAL))); + resprint.put(TRP, new PrintStream(allConfFiles.get(TRP))); + resprint.put(TYR, new PrintStream(allConfFiles.get(TYR))); + resprint.put(SEL, new PrintStream(allConfFiles.get(SEL))); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } + return resprint; + } + + public static HashMap<Resource, String> getAminoAcidArffAttributeMap(){ + HashMap<Resource, String> resdata = new HashMap<Resource, String>(30); + resdata.put(ALA, new String("2,0,0.5,?,?")); + resdata.put(CYS, new String("1,0,1,?,0")); + resdata.put(ASP, new String("0,-1,1,?,-1")); + resdata.put(GLU, new String("0,-1,2,?,-1")); + resdata.put(PHE, new String("2,0,2,1,?")); + resdata.put(GLY, new String("2,0,0.5,?,?")); + resdata.put(HIS, new String("1,1,2,1,1")); + resdata.put(ILE, new String("2,0,2,0,?")); + resdata.put(LYS, new String("1,1,2,?,1")); + resdata.put(LEU, new String("2,0,2,0,?")); + resdata.put(MET, new String("2,0,2,?,?")); + resdata.put(ASN, new String("0,0,1,?,0")); + resdata.put(PRO, new String("?,0,1,?,?")); + resdata.put(GLN, new String("0,0,2,?,0")); + resdata.put(ARG, new String("0,1,2,?,1")); + resdata.put(SER, new String("0,0,0.5,?,0")); + resdata.put(THR, new String("1,0,1,?,0,")); + resdata.put(VAL, new String("2,0,1,0,?")); + resdata.put(TRP, new String("1,0,2,1,1")); + resdata.put(TYR, new String("1,0,2,1,0")); + resdata.put(SEL, new String("?,?,?,?,?")); + + return resdata; + } + + +} Copied: trunk/scripts/src/main/java/org/dllearner/examples/pdb/ConfFileFilter.java (from rev 3298, trunk/scripts/src/main/java/org/dllearner/examples/pdb/RdfFileFilter.java) =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/ConfFileFilter.java (rev 0) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/ConfFileFilter.java 2011-10-26 07:52:30 UTC (rev 3320) @@ -0,0 +1,15 @@ +package org.dllearner.examples.pdb; + +import java.io.File; +import java.io.FileFilter; + +public class ConfFileFilter implements FileFilter +{ + private final String extension = new String("conf"); + + public boolean accept(File file) + { + return file.getName().toLowerCase().endsWith(extension); + } +} + Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java 2011-10-25 07:49:07 UTC (rev 3319) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java 2011-10-26 07:52:30 UTC (rev 3320) @@ -1,801 +1,389 @@ package org.dllearner.examples.pdb; import java.io.File; -import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileWriter; import java.io.IOException; -import java.io.InputStreamReader; -import java.io.LineNumberReader; import java.io.PrintStream; -import java.text.SimpleDateFormat; import java.util.ArrayList; -import java.util.Date; import java.util.HashMap; import java.util.Iterator; -import java.util.Set; +import org.apache.log4j.ConsoleAppender; +import org.apache.log4j.FileAppender; +import org.apache.log4j.HTMLLayout; +import org.apache.log4j.Layout; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import org.apache.log4j.PatternLayout; import org.dllearner.cli.Start; import org.dllearner.core.ComponentInitException; import org.dllearner.core.ComponentManager; -import org.dllearner.core.AbstractKnowledgeSource; -import org.dllearner.core.AbstractReasonerComponent; +import org.dllearner.core.LearningProblemUnsupportedException; +import org.dllearner.core.ReasonerComponent; import org.dllearner.core.owl.Description; -import org.dllearner.core.owl.Individual; -import org.dllearner.kb.OWLFile; import org.dllearner.reasoning.FastInstanceChecker; -import org.xml.sax.InputSource; -import org.xml.sax.SAXException; -import com.hp.hpl.jena.query.Query; -import com.hp.hpl.jena.query.QueryExecution; -import com.hp.hpl.jena.query.QueryExecutionFactory; -import com.hp.hpl.jena.query.QueryFactory; -import com.hp.hpl.jena.query.QuerySolution; -import com.hp.hpl.jena.query.ResultSet; -import com.hp.hpl.jena.query.ResultSetFormatter; -import com.hp.hpl.jena.rdf.model.Literal; import com.hp.hpl.jena.rdf.model.NodeIterator; import com.hp.hpl.jena.rdf.model.Property; import com.hp.hpl.jena.rdf.model.ResIterator; import com.hp.hpl.jena.rdf.model.Resource; import com.hp.hpl.jena.rdf.model.ResourceFactory; import com.hp.hpl.jena.rdf.model.Statement; -import com.hp.hpl.jena.rdf.model.StmtIterator; -import com.hp.hpl.jena.vocabulary.RDFS; import com.dumontierlab.pdb2rdf.model.PdbRdfModel; -import com.dumontierlab.pdb2rdf.parser.PdbXmlParser; -import com.dumontierlab.pdb2rdf.util.Pdb2RdfInputIterator; -import com.dumontierlab.pdb2rdf.util.PdbsIterator; -import edu.stanford.nlp.io.EncodingPrintWriter.out; - - public class HelixRDFCreator { - private static Resource ala = ResourceFactory.createResource("http://bio2rdf.org/pdb:Alanine"); - private static Resource cys = ResourceFactory.createResource("http://bio2rdf.org/pdb:Cysteine"); - private static Resource asp = ResourceFactory.createResource("http://bio2rdf.org/pdb:AsparticAcid"); - private static Resource glu = ResourceFactory.createResource("http://bio2rdf.org/pdb:GlutamicAcid"); - private static Resource phe = ResourceFactory.createResource("http://bio2rdf.org/pdb:Phenylalanine"); - private static Resource gly = ResourceFactory.createResource("http://bio2rdf.org/pdb:Glycine"); - private static Resource his = ResourceFactory.createResource("http://bio2rdf.org/pdb:Histidine"); - private static Resource ile = ResourceFactory.createResource("http://bio2rdf.org/pdb:Isoleucine"); - private static Resource lys = ResourceFactory.createResource("http://bio2rdf.org/pdb:Lysine"); - private static Resource leu = ResourceFactory.createResource("http://bio2rdf.org/pdb:Leucine"); - private static Resource met = ResourceFactory.createResource("http://bio2rdf.org/pdb:Methionine"); - private static Resource asn = ResourceFactory.createResource("http://bio2rdf.org/pdb:Asparagine"); - private static Resource pro = ResourceFactory.createResource("http://bio2rdf.org/pdb:Proline"); - private static Resource gln = ResourceFactory.createResource("http://bio2rdf.org/pdb:Glutamine"); - private static Resource arg = ResourceFactory.createResource("http://bio2rdf.org/pdb:Arginine"); - private static Resource ser = ResourceFactory.createResource("http://bio2rdf.org/pdb:Serine"); - private static Resource thr = ResourceFactory.createResource("http://bio2rdf.org/pdb:Threonine"); - private static Resource val = ResourceFactory.createResource("http://bio2rdf.org/pdb:Valine"); - private static Resource trp = ResourceFactory.createResource("http://bio2rdf.org/pdb:Tryptophan"); - private static Resource tyr = ResourceFactory.createResource("http://bio2rdf.org/pdb:Tyrosine"); - private static Resource sel = ResourceFactory.createResource("http://bio2rdf.org/pdb:Selenomethionine"); - private static Resource hyt = ResourceFactory.createResource("http://bio2rdf.org/pdb:2-hydroxy-tryptophan"); + private static Logger _logger = Logger.getLogger(HelixRDFCreator.class); + private static Logger _rootLogger = Logger.getRootLogger(); - private static ArrayList<Resource> positives; - private static ArrayList<Resource> negatives; - - public void setNegatives(ArrayList<Resource> neg){ - negatives = neg; - } - - public ArrayList<Resource> getPositives(){ - return positives; - } - - public ArrayList<Resource> getNegatives(){ - return negatives; - } - - private static String dataDir = "../test/pdb/"; - private static HashMap<Resource, File> confFilePerResidue; - private static File confFileForAll; - - public void setPositives(ArrayList<Resource> pos){ - positives = pos; - } - + private static String _dataDir = "../test/pdb/"; + private static File _dir = new File(_dataDir); + /** * @param args * TODO: remove beginsAt, endsAt from model */ public static void main(String[] args) { + + // create loggers (a simple logger which outputs + // its messages to the console and a log file) + + // logger 1 is the console, where we print only info messages; + // the logger is plain, i.e. does not output log level etc. + Layout layout = new PatternLayout(); + + ConsoleAppender consoleAppender = new ConsoleAppender(layout); + // setting a threshold suppresses log messages below this level; + // this means that if you want to e.g. see all trace messages on + // console, you have to set the threshold and log level to trace + // (but we recommend just setting the log level to trace and observe + // the log file) + consoleAppender.setThreshold(Level.DEBUG); + + // logger 2 is writes to a file; it records all debug messages + // (you can choose HTML or TXT) + boolean htmlLog = false; + Layout layout2 = null; + FileAppender fileAppenderNormal = null; + String fileName; + if(htmlLog) { + layout2 = new HTMLLayout(); + fileName = _dataDir + "log/log.html"; + } else { + // simple variant: layout2 = new SimpleLayout(); + layout2 = new PatternLayout("%r [%t] %-5p %c :\n%m%n\n"); + fileName = _dataDir + "log/log.txt"; + } + try { + fileAppenderNormal = new FileAppender(layout2, fileName, false); + } catch (IOException e) { + e.printStackTrace(); + } + + // add both loggers + _rootLogger.removeAllAppenders(); + _rootLogger.addAppender(consoleAppender); + _rootLogger.addAppender(fileAppenderNormal); + _rootLogger.setLevel(Level.DEBUG); + + + Boolean fasta = true; + /* - * test = true -> use test data - * test = false -> use data set 1, 2, 3 or 4 - */ - Boolean test = true; - /* * rdfConf = true -> write out the .rdf and .conf-Files * rdfConf = false -> does not generate those files */ Boolean rdfConf = true; + /* * arff = true -> write out .arff-Files * arff = false -> does not generate those files */ - Boolean arff = false; + Boolean arff = true; /* * load = true -> load alle .rdf, .conf and .arff Files that can be found within the directory dataDir * load = false -> don't load anything */ - Boolean load = true; - Boolean dlLearn = false; - Boolean wekaLearn = false; + Boolean load = false; + Boolean dlLearn = true; + Boolean wekaLearn = true; - int dataSet = 1; + int dataSet = 5; + /* - * get data set files - */ - // data set 1 - String bt426 = dataDir + "bt426.list"; - File bt426List = new File(bt426); - // data set 2 - String plp273 = dataDir + "plp273.list"; - File plp273List = new File(plp273); - // data set 3 - String plp364 = dataDir + "plp364.list"; - File plp364List = new File(plp364); - // data set 4 - String plp399 = dataDir + "plp399.list"; - File plp399List = new File(plp399); - - /* * data for test purpose */ - String pdbID = "1XFF"; - String chainID = "A"; - File dir = new File(dataDir); +// PdbProtein testProtein = new PdbProtein("1XFF"); + PDBProtein testProtein = new PDBProtein("1XFF", "A"); /* - * generate trainset and fill trainmodel + * create a training data set */ - PdbRdfModel trainmodel = new PdbRdfModel(); - TrainAndTestSet trainSet = new TrainAndTestSet(); + ProteinDataSet proteinSet; - if ( !test ) - { - switch (dataSet) { - case 1: - trainSet = new TrainAndTestSet(bt426List); - break; - case 2: - trainSet = new TrainAndTestSet(plp273List); - break; - case 3: - trainSet = new TrainAndTestSet(plp364List); - break; - case 4: - trainSet = new TrainAndTestSet(plp399List); - break; - } + switch (dataSet) { + case 1: + proteinSet = ProteinDataSet.bt426(); + break; + case 2: + proteinSet = ProteinDataSet.plp273(); + break; + case 3: + proteinSet = ProteinDataSet.plp364(); + break; + case 4: + proteinSet = ProteinDataSet.plp399(); + break; + default: + proteinSet = new ProteinDataSet(testProtein); + break; } - else - { - trainSet = new TrainAndTestSet(pdbID, chainID); - } - HashMap<String,File> rdfFiles = new HashMap<String,File>(); - HashMap<String,File> arffFiles = new HashMap<String,File>(); - - if (load) - { - System.out.println("Starting to load files in " + dataDir ); - File[] pdbDir = dir.listFiles(new DirectoryFileFilter()); - for (File actDir : pdbDir) { - File[] rdfFilesInActDir = actDir.listFiles(new RdfFileFilter()); - try { - System.out.println("Looking for Files in " + actDir.getCanonicalPath() ); - } catch (IOException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - for (File rdfFile : rdfFilesInActDir) { - rdfFiles.put(rdfFile.getName().substring(0, 3), rdfFile); - System.out.println("Found RDF File for PDB ID " + rdfFile.getName().substring(0, 3) ); - System.out.println("Found RDF File " + rdfFile.getName() ); - } - File[] arffFilesInActDir = actDir.listFiles(new ArffFileFilter()); - for (File arffFile : arffFilesInActDir) { - arffFiles.put(arffFile.getName().substring(0, 3), arffFile); - System.out.println("Found RDF File for PDB ID " + arffFile.getName().substring(0, 3) ); - System.out.println("Found ARFF File " + arffFile.getName() ); - } - } - } - - - /* * generate a PdbRdfModel for every pdbID */ + PDBIdRdfModel trainmodel; - - - for (int i = 0; i < trainSet.getTrainset().length; i++) + for (int i = 0; i < proteinSet.getProteinset().size(); i++) { - System.out.println("pdbId: " + trainSet.getTrainset()[i].getPdbID()); - System.out.println("chainID: " + trainSet.getTrainset()[i].getChainID()); - trainmodel.removeAll(); - trainmodel.add(getRdfModelForIds(trainSet.getTrainset()[i].getPdbID(), trainSet.getTrainset()[i].getChainID())); - - /* - * extract the species the protein originates from - */ - trainSet.getTrainset()[i].setSpecies(getSpecies(trainmodel, trainSet.getTrainset()[i].getPdbID())); - - /* - * as we have sometimes to handle several amino acid chains we need the first - * amino acid of every chain, they are returned within a ResIterator - */ - ResIterator niter = getFirstAA(trainmodel); - - /* - * we add the information of which amino acid is the fourth predecessor of which other amino acid - */ - trainmodel = addDistanceInfo(trainmodel); - - /* - * take all amino acids which are in helices and put them into the - * global positives ArrayList, and all others in the global negatives ArrayList - */ - createPositivesAndNegatives(niter, trainmodel); - String rdfFile; - String arffFile; - if (trainSet.getTrainset()[i].getChainID().length() == 0) - { - rdfFile = trainSet.getTrainset()[i].getPdbID().toUpperCase() + ".rdf"; - arffFile = trainSet.getTrainset()[i].getPdbID().toUpperCase() + ".arff"; - } - else - { - rdfFile = trainSet.getTrainset()[i].getPdbID().toUpperCase() + "." - + trainSet.getTrainset()[i].getChainID().toUpperCase() + ".rdf"; - arffFile = trainSet.getTrainset()[i].getPdbID().toUpperCase() + "." - + trainSet.getTrainset()[i].getChainID().toUpperCase() + ".arff"; - } - String pdbDir = dataDir + trainSet.getTrainset()[i].getPdbID() + "/"; - File directory = new File(pdbDir); - directory.mkdir(); - String rdfFilePath = pdbDir + rdfFile; - String arffFilePath = pdbDir + arffFile; - - /* - * if arff = true create pdbID.arff files - */ - if (arff) - { - niter = getFirstAA(trainmodel); - createArffFile(arffFilePath, trainmodel, trainSet, niter); - } - - /* - * if rdfConf = true create pdbID.rdf and *.conf files - */ - if(rdfConf) - { - try - { - /* - * creatConfFile() - * writes the conf-Files and saves there File-objects in: - * confFileForAll and confFilePerResidue - */ - createConfFile(pdbDir, rdfFile, trainmodel); - PrintStream out = new PrintStream (new File(rdfFilePath)); - - // Output results - trainmodel.write(out, "RDF/XML"); - - // Important - free up resources used running the query - out.close(); - } - catch (IOException e) - { - System.err.println("OutputStream konnte nicht geschlossen werden!"); - } - } - /* - * remove all triples that contain information about begin and end of helices - */ - - Property beginsAt = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "beginsAt"); - trainmodel = removeStatementsWithPoperty(trainmodel, beginsAt); - Property endsAt = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "endsAt"); - trainmodel = removeStatementsWithPoperty(trainmodel, endsAt); - Resource residue = ResourceFactory.createResource("http://bio2rdf.org/pdb:Residue"); - trainmodel = removeStatementsWithObject(trainmodel, residue); - } - - /* - * write out the files that contain information about which proteins originate from which species - */ - HashMap<String, File> proteinsOfSpecies = new HashMap<String, File>(); - for (int i = 0; i < trainSet.getTrainset().length; i++){ - if (proteinsOfSpecies.get(trainSet.getTrainset()[i].getSpecies()) == null){ - File speciesProteins = new File(dataDir + trainSet.getTrainset()[i].getSpecies() + ".pos"); - proteinsOfSpecies.put(trainSet.getTrainset()[i].getSpecies(), speciesProteins); - } - if (proteinsOfSpecies.get(trainSet.getTrainset()[i].getSpecies()).canWrite()) { + if (rdfConf || arff) { + + PDBProtein protein = proteinSet.getProteinset().get(i); + String pdbDir = _dataDir + protein.getPdbID() + "/"; + File directory = new File(pdbDir); + if(! directory.exists()) directory.mkdir(); + // + //String arffFilePath = pdbDir + protein.getArffFileName(); + + _logger.info("PDB ID: " + protein.getPdbID()); + _logger.info("chain ID: " + protein.getChainID()); + trainmodel = new PDBIdRdfModel(protein); + + if (fasta){ + trainmodel.createFastaFile(pdbDir); + } + + + /* + * if arff = true create pdbID.arff files + */ + + /* + * as we have sometimes to handle several amino acid chains we need the first + * amino acid of every chain, they are returned within a ResIterator + */ + + if (arff) + { + ResIterator niter = trainmodel.getFirstAA(); + createArffFile(pdbDir, trainmodel, niter); + } + + /* + * remove all triples that contain information about begin and end of helices + */ + Property beginsAt = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "beginsAt"); + trainmodel.removeStatementsWithPoperty(beginsAt); + Property endsAt = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "endsAt"); + trainmodel.removeStatementsWithPoperty(endsAt); + Resource residue = ResourceFactory.createResource("http://bio2rdf.org/pdb:Residue"); + trainmodel.removeStatementsWithObject(residue); + + /* + * we add the information which amino acid is the fourth predecessor of which other amino acid + */ + trainmodel.addDistanceInfo(); + + /* + * if rdfConf = true create pdbID.rdf and *.conf files + */ + + + + if(rdfConf) + { + String rdfFilePath = pdbDir + protein.getRdfFileName(); + try + { + /* + * creatConfFile() + * writes the conf-Files and saves there File-objects in: + * confFileForAll and confFilePerResidue + */ + createConfFile(pdbDir, trainmodel); + + PrintStream out = new PrintStream (new File(rdfFilePath)); + + // Output results + trainmodel.getModel().write(out, "RDF/XML"); + + // Important - free up resources used running the query + out.close(); + } + catch (FileNotFoundException e) + { + _logger.error("File " + rdfFilePath + " konnte nicht gefunden werden!"); + e.printStackTrace(); + } + } + /* + * For every protein source species create a file that contains a list of all + * proteins that originate from that particular species. If it already exists + * we will append to it. + */ + File speciesProteins = new File(_dataDir + protein.getSpecies() + ".pos"); + try { - FileWriter out = new FileWriter(proteinsOfSpecies.get(trainSet.getTrainset()[i].getSpecies()), true); - String line = trainSet.getTrainset()[i].getPdbID() + "." - + trainSet.getTrainset()[i].getChainID() + ".\n"; + String line = protein.getPdbID() + "." + protein.getChainID() + "." + protein.getSpecies() + "\n"; + FileWriter out = new FileWriter(speciesProteins, true); + _logger.debug("Write " + line + "to file " + speciesProteins.getPath() + speciesProteins.getName()); out.write(line); out.close(); } catch (FileNotFoundException e) { - // TODO Auto-generated catch block + _logger.error("Could not find file " + speciesProteins.getPath() + speciesProteins.getName()); e.printStackTrace(); } catch (IOException e) { - // TODO Auto-generated catch block + _logger.error("Something went wrong while trying to write to " + speciesProteins.getPath() + speciesProteins.getName()); e.printStackTrace(); } - } } if(dlLearn) { - - - /* - * load RDF file and perform learn algorithm for every .conf-file - */ - /*ComponentManager cm = ComponentManager.getInstance(); - KnowledgeSource ks = cm.knowledgeSource(OWLFile.class); - cm.applyConfigEntry(ks, "url","file://" + filename ); - ReasonerComponent rc = cm.reasoner(FastInstanceChecker.class); - try { - rc.init(); - } catch (ComponentInitException e1) { - e1.printStackTrace(); - } - Start start = null; - Iterator<Resource> aa = confFilePerResidue.keySet().iterator(); - while ( aa.hasNext() ){ - Resource nextRes = aa.next(); - System.out.println(confFilePerResidue.get(nextRes).getAbsolutePath()); - try{ - start = new Start(confFilePerResidue.get(nextRes)); - } catch (ComponentInitException e) { - e.printStackTrace(); - } catch (FileNotFoundException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } catch (org.dllearner.confparser.ParseException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - start.start(false); - Description d = start.getLearningAlgorithm().getCurrentlyBestDescription(); - System.out.println(d.toKBSyntaxString()); - } - */ + startDlLearner(); } if(wekaLearn) { + startWekaLearner(); } } - private static PdbRdfModel getRdfModelForIds(String pdbID ,String chainID) { - - /* - * i is an Iterator over an XML InputSource - */ - String[] pdbIDs = {pdbID}; - Pdb2RdfInputIterator i = new PdbsIterator(pdbIDs); - PdbXmlParser parser = new PdbXmlParser(); - PdbRdfModel allmodels = new PdbRdfModel(); - try { - while (i.hasNext()) - { - final InputSource input = i.next(); - PdbRdfModel model = parser.parse(input, new PdbRdfModel()); - /* - * jedes Model muss gleich nach den relevanten Daten durchsucht werden, - * da ansonsten Probleme mit der Speichergröße auftreten können. - */ - allmodels.add(getData(model, pdbID, chainID)); - } - } - catch (IOException e) - { - // TODO Auto-generated catch block - e.printStackTrace(); + private static void startDlLearner(){ + HashMap<String, File> pdbIDConfFile = loadConfFiles(_dir); + for (String pdbID : pdbIDConfFile.keySet()){ + try { + new PDBDLLearner(pdbIDConfFile.get(pdbID)); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (ComponentInitException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (LearningProblemUnsupportedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } } - catch (SAXException e) - { - // TODO Auto-generated catch block - e.printStackTrace(); - } - return allmodels; } - private static PdbRdfModel getData(PdbRdfModel model, String pdbID, String chainID) { - - // Beispiel einer SELECT Abfrage - /* String selectQuery = - * "SELECT { ?x1 ?x2 ?x3 .} " + - * "WHERE { ?x1 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://bio2rdf.org/pdb:Helix> .}"; - * Query query = QueryFactory.create(selectQuery); - * QueryExecution qe = QueryExecutionFactory.create(query, model); - * ResultSet select = qe.execSelect(); - * ResultSetFormatter.out (System.out, select, query); - * - */ - - // CONSTRUCT Abfrage - - PdbRdfModel construct = new PdbRdfModel(); - /* - * i do it kind of difficult, but i want to be certain that i only get the sequences of - * Polypeptides(L) which contain at least one Helix. Furthermore i collect the information - * about at which position helices begin and end. - * NOTE: this information has to be removed before outputing the model. But i will use this - * to check for positive and negative train amino acids - */ - /* - * ich brauche noch die selektion der chain und die info über den genursprungsorganismus - * rdf:resource="http://bio2rdf.org/pdb:3LQH/chain_A" - * http://bio2rdf.org/pdb:3LQH/chain_A/position_1596 - */ - - String queryString = ""; - - if (chainID.length() != 1 || pdbID.length() != 4) - { - queryString = - "PREFIX pdb: <http://bio2rdf.org/pdb:> " + - "PREFIX dcterms: <http://purl.org/dc/terms/> " + - "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> " + - "PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> " + - "PREFIX fn: <http://www.w3.org/2005/xpath-functions#> " + - "CONSTRUCT { ?x1 <http://bio2rdf.org/pdb:beginsAt> ?x2 ." + - " ?x1 <http://bio2rdf.org/pdb:endsAt> ?x3 . " + - " ?x5 <http://purl.org/dc/terms/isPartOf> ?x4 . " + - " ?x5 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?x6 ." + - " ?x5 <http://bio2rdf.org/pdb:isImmediatelyBefore> ?x7 ." + - " ?xxx rdfs:label ?label .} " + - "WHERE { ?x1 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://bio2rdf.org/pdb:Helix> ." + - " ?x1 <http://bio2rdf.org/pdb:beginsAt> ?x2 ." + - " ?x1 <http://bio2rdf.org/pdb:endsAt> ?x3 ." + - " ?x3 <http://purl.org/dc/terms/isPartOf> ?x4 ." + - " ?x4 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://bio2rdf.org/pdb:Polypeptide(L)> ." + - " ?x5 <http://purl.org/dc/terms/isPartOf> ?x4 ." + - " ?x5 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?x6 ." + - " ?xxx rdfs:label ?label FILTER (str(?xxx) = fn:concat(str(?x4), '/extraction/source/gene/organism')) . " + - // with the Optional clause i get the information by which amino acid - // a amino acid is followed - " OPTIONAL { ?x5 <http://bio2rdf.org/pdb:isImmediatelyBefore> ?x7 . } .}"; + private static void startWekaLearner() { + HashMap<String, File> pdbIDArffFile = loadArffFiles(_dir); + for (String pdbID: pdbIDArffFile.keySet()){ + try { + new PDBWekaLearner(pdbIDArffFile.get(pdbID)); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } } - else - { - queryString = - "PREFIX pdb: <http://bio2rdf.org/pdb:> " + - "PREFIX dcterms: <http://purl.org/dc/terms/> " + - "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> " + - "PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> " + - "PREFIX fn: <http://www.w3.org/2005/xpath-functions#> " + - "CONSTRUCT { ?x1 <http://bio2rdf.org/pdb:beginsAt> ?x2 ." + - " ?x1 <http://bio2rdf.org/pdb:endsAt> ?x3 . " + - " ?x5 <http://purl.org/dc/terms/isPartOf> ?x4 . " + - " ?x5 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?x6 ." + - " ?x5 <http://bio2rdf.org/pdb:isImmediatelyBefore> ?x7 ." + - " ?xxx rdfs:label ?label .} " + - "WHERE { ?x1 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://bio2rdf.org/pdb:Helix> ." + - " ?x1 <http://bio2rdf.org/pdb:beginsAt> ?x2 ." + - " ?x1 <http://bio2rdf.org/pdb:endsAt> ?x3 ." + - " ?x3 <http://purl.org/dc/terms/isPartOf> ?x4 ." + - " ?x4 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://bio2rdf.org/pdb:Polypeptide(L)> ." + - " ?x5 <http://purl.org/dc/terms/isPartOf> ?x4 ." + - " ?x5 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?x6 ." + - " ?x5 <http://bio2rdf.org/pdb:hasChainPosition> ?x8 ." + - " ?x8 <http://purl.org/dc/terms/isPartOf> <http://bio2rdf.org/pdb:" + pdbID.toUpperCase() + - "/chain_" + chainID.toUpperCase() + "> ." + - " ?xxx rdfs:label ?label FILTER (str(?xxx) = fn:concat(str(?x4), '/extraction/source/gene/organism')) . " + - // with the Optional clause i get the information by which amino acid - // a amino acid is followed - " OPTIONAL { ?x5 <http://bio2rdf.org/pdb:isImmediatelyBefore> ?x7 . } .}"; - } - - //System.out.println(queryString); - Query query = QueryFactory.create(queryString); - QueryExecution qe = QueryExecutionFactory.create(query, model); - construct.add(qe.execConstruct()); - qe.close(); - return construct; - } - - private static String getSpecies( PdbRdfModel model, String pdbID) { - String queryString ; - queryString = - "PREFIX pdb: <http://bio2rdf.org/pdb:> " + - "PREFIX dcterms: <http://purl.org/dc/terms/> " + - "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> " + - "PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> " + - "PREFIX fn: <http://www.w3.org/2005/xpath-functions#> " + - "SELECT ?species " + - "WHERE { ?x1 <http://purl.org/dc/terms/isPartOf> ?x4 ." + - " ?x1 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?x2 ." + - " ?x1 <http://bio2rdf.org/pdb:isImmediatelyBefore> ?x3 ." + - " ?x5 rdfs:label ?species FILTER (str(?x5) = fn:concat(str(?x4), '/extraction/source/gene/organism')) . }"; - - // System.out.println(queryString); - Query query = QueryFactory.create(queryString); - QueryExecution qe = QueryExecutionFactory.create(query, model); - String species = ""; - try - { - ResultSet results = qe.execSelect() ; - for ( ; results.hasNext() ; ) - { - /* - * every entry in the ResultSet has the same value - */ - QuerySolution soln = results.nextSolution() ; - Literal l = soln.getLiteral("species") ; // Get a result variable - must be a literal - species = l.getString(); + } + private static HashMap<String,File> loadConfFiles (File dir){ + HashMap<String,File> confFiles = new HashMap<String,File>(); + _logger.info("Starting to load files in " + dir ); + File[] pdbDir = dir.listFiles(new DirectoryFileFilter()); + for (File activeDirectory : pdbDir) { + File[] confFilesInActiveDirectory = activeDirectory.listFiles(new ConfFileFilter()); + _logger.info("Looking for Files in " + activeDirectory.getPath() ); + for (File confFile : confFilesInActiveDirectory) { + String confFileName = confFile.getName().substring(0, confFile.getName().indexOf(".conf")); + confFiles.put(confFileName, confFile); + _logger.info("Found .conf File " + confFile.getPath() ); } } - finally - { - qe.close() ; - } - return species; + return confFiles; } - - - private static ResIterator getFirstAA( PdbRdfModel model) { - PdbRdfModel construct = new PdbRdfModel(); - /* i look for all amino acids (AA) that have a successor - * but do not have a predecessor -> it's the first AA of every - * polypeptide chain - */ - - String queryString = - "PREFIX pdb: <http://bio2rdf.org/pdb:> " + - "CONSTRUCT { ?x1 pdb:isImmediatelyBefore ?x2 . } " + - "WHERE { ?x1 pdb:isImmediatelyBefore ?x2 . " + - // NOT EXISTS can be used with SPARQL 1.1 - //"NOT EXISTS { ?x3 pdb:isImmediatelyBefore ?x1 . } }"; - " OPTIONAL { ?x3 pdb:isImmediatelyBefore ?x1 . } " + - " FILTER ( !BOUND(?x3) ) }"; - Query query = QueryFactory.create(queryString); - QueryExecution qe = QueryExecutionFactory.create(query, model); - construct.add(qe.execConstruct()); - qe.close(); - ResIterator niter = construct.listSubjects(); - return niter; - } - private static PdbRdfModel addDistanceInfo(PdbRdfModel model){ - String queryString = - "PREFIX pdb: <http://bio2rdf.org/pdb:> " + - "CONSTRUCT { ?x1 pdb:isFourAminoAcidsBefore ?x5 . } " + - "WHERE { ?x1 pdb:isImmediatelyBefore ?x2 . " + - " ?x2 pdb:isImmediatelyBefore ?x3 . " + - " ?x3 pdb:isImmediatelyBefore ?x4 . " + - " ?x4 pdb:isImmediatelyBefore ?x5 . }"; - Query query = QueryFactory.create(queryString); - QueryExecution qe = QueryExecutionFactory.create(query, model); - model.add(qe.execConstruct()); - qe.close(); - return model; - } - - private static void createPositivesAndNegatives(ResIterator riter, PdbRdfModel model) { + private static HashMap<String,File> loadArffFiles (File dir){ + HashMap<String,File> arffFiles = new HashMap<String,File>(); + _logger.info("Starting to load files in " + dir ); - // Properties i have to check for while going through the AA-chain - Property iib = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "isImmediatelyBefore"); - Property ba = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "beginsAt"); - Property ea = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "endsAt"); - ArrayList<Resource> pos = new ArrayList<Resource>(); - ArrayList<Resource> neg = new ArrayList<Resource>(); - - // every element in riter stands for a AA-chain start - // every first amino acid indicates a new AA-chain - while (riter.hasNext()) { - // Initialization of variables needed - Resource aaOne = riter.nextResource(); - Resource currentaa = aaOne; - Resource nextaa = aaOne; - boolean inHelix = false; - - // look if there is a next AA - do { - // looks weird, but is needed to enter loop even for the last AA which does not have a iib-Property - currentaa = nextaa; - // die Guten ins Töpfchen ... - // if we get an non-empty iterator for pdb:beginsAt the next AAs are within a AA-chain - if(model.listResourcesWithProperty(ba, currentaa).hasNext() && !inHelix ){ - inHelix = true; - } - // die Schlechten ins Kröpfchen - // if we get an non-empty iterator for pdb:endsAt and are already within a AA-chain - // the AAs AFTER the current ones aren't within a helix - if (model.listResourcesWithProperty(ea, currentaa).hasNext() && inHelix){ - inHelix = false; - } - // get next AA if there is one - if (model.listObjectsOfProperty(currentaa, iib).hasNext()){ - nextaa = model.getProperty(currentaa, iib).getResource(); - } - - // add current amino acid to positives or negatives set - if (inHelix){ - pos.add(currentaa); - } else { - neg.add(currentaa); - } - - } while (currentaa.hasProperty(iib)) ; + File[] pdbDir = dir.listFiles(new DirectoryFileFilter()); + for (File activeDirectory : pdbDir) { + File[] arffFilesInActDir = activeDirectory.listFiles(new ArffFileFilter()); + _logger.info("Looking for .arff Files in " + activeDirectory.getPath()); + for (File arffFile : arffFilesInActDir) { + String arffFileName = arffFile.getName().substring(0, arffFile.getName().indexOf(".arff")); + arffFiles.put(arffFileName, arffFile); + _logger.info("Found .arff File " + arffFile.getPath()); + } } - positives = pos; - negatives = neg; + return arffFiles; } - - - - private static PdbRdfModel removeStatementsWithPoperty(PdbRdfModel model, Property prop){ - - String queryString = - "PREFIX x:<" + prop.getNameSpace() + "> " + - "CONSTRUCT { ?x1 x:" + prop.getLocalName()+ " ?x2 . } " + - "WHERE { ?x1 x:" + prop.getLocalName() + " ?x2 . }"; - //System.out.println(queryString); - Query query = QueryFactory.create(queryString); - QueryExecution qe = QueryExecutionFactory.create(query, model); - StmtIterator stmtiter = qe.execConstruct().listStatements(); - qe.close(); - while(stmtiter.hasNext()){ - model.remove(stmtiter.next()); - } - - return model; - } - - private static PdbRdfModel removeStatementsWithObject(PdbRdfModel model, Resource res){ - - String queryString = - "PREFIX x:<" + res.getNameSpace() + "> " + - "CONSTRUCT { ?x1 ?x2 x:" + res.getLocalName() + " . } " + - "WHERE { ?x1 ?x2 x:" + res.getLocalName() + " . }"; - // System.out.println(queryString); - Query query = QueryFactory.create(queryString); - QueryExecution qe = QueryExecutionFactory.create(query, model); - StmtIterator stmtiter = qe.execConstruct().listStatements(); - qe.close(); - while(stmtiter.hasNext()){ - model.remove(stmtiter.next()); - } - - return model; - } - - - private static void createConfFile(String dir, String rdffile, PdbRdfModel model){ + private static void createConfFile(String pdbDir, PDBIdRdfModel model){ try { + PDBProtein protein = model.getProtein(); // the file with all amino acids - String pdbname = dir + "pdb" + ".conf"; - confFileForAll = new File(pdbname); - PrintStream out = new PrintStream (confFileForAll); - // add import statements - out.println("import(\"AA_properties.owl\");"); - out.println("import(\"" + rdffile + "\");"); - out.println(); + String confFilePath = pdbDir + protein.getConfFileName(); + PrintStream confFile = new PrintStream (new File(confFilePath)); + // add import statements to confFile + String importStmt = new String("import(\"../AA_properties.owl\");\n" + + "import(\"" + protein.getRdfFileName() + "\");\n"); + confFile.println(importStmt); - HashMap<Resource, File> resConfFiles = new HashMap<Resource, File>(30); - resConfFiles.put(ala, new File(dir + ala.getLocalName() + ".conf")); - resConfFiles.put(cys, new File(dir + cys.getLocalName() + ".conf")); - resConfFiles.put(asp, new File(dir + asp.getLocalName() + ".conf")); - resConfFiles.put(glu, new File(dir + glu.getLocalName() + ".conf")); - resConfFiles.put(phe, new File(dir + phe.getLocalName() + ".conf")); - resConfFiles.put(gly, new File(dir + gly.getLocalName() + ".conf")); - resConfFiles.put(his, new File(dir + his.getLocalName() + ".conf")); - resConfFiles.put(ile, new File(dir + ile.getLocalName() + ".conf")); - resConfFiles.put(lys, new File(dir + lys.getLocalName() + ".conf")); - resConfFiles.put(leu, new File(dir + leu.getLocalName() + ".conf")); - resConfFiles.put(met, new File(dir + met.getLocalName() + ".conf")); - resConfFiles.put(asn, new File(dir + asn.getLocalName() + ".conf")); - resConfFiles.put(pro, new File(dir + pro.getLocalName() + ".conf")); - resConfFiles.put(gln, new File(dir + gln.getLocalName() + ".conf")); - resConfFiles.put(arg, new File(dir + arg.getLocalName() + ".conf")); - resConfFiles.put(ser, new File(dir + ser.getLocalName() + ".conf")); - resConfFiles.put(thr, new File(dir + thr.getLocalName() + ".conf")); - resConfFiles.put(val, new File(dir + val.getLocalName() + ".conf")); - resConfFiles.put(trp, new File(dir + trp.getLocalName() + ".conf")); - resConfFiles.put(tyr, new File(dir + tyr.getLocalName() + ".conf")); - resConfFiles.put(sel, new File(dir + sel.getLocalName() + ".conf")); - resConfFiles.put(hyt, new File(dir + hyt.getLocalName() + ".conf")); - confFilePerResidue = resConfFiles; + HashMap<Resource, File> confFilePerResidue = AminoAcids.getAllConfFiles(pdbDir, protein.getConfFileName()); + + HashMap<Resource, PrintStream> resprint = AminoAcids.getAminoAcidPrintStreamMap(confFilePerResidue); - - - // put all amino acid resources and the their conf-files together - HashMap<Resource, PrintStream> resprint = new HashMap<Resource, PrintStream>(30); - resprint.put(ala, new PrintStream(resConfFiles.get(ala))); - resprint.put(cys, new PrintStream(resConfFiles.get(cys))); - resprint.put(asp, new PrintStream(resConfFiles.get(asp))); - resprint.put(glu, new PrintStream(resConfFiles.get(glu))); - resprint.put(phe, new PrintStream(resConfFiles.get(phe))); - resprint.put(gly, new PrintStream(resConfFiles.get(gly))); - resprint.put(his, new PrintStream(resConfFiles.get(his))); - resprint.put(ile, new PrintStream(resConfFiles.get(ile))); - resprint.put(lys, new PrintStream(resConfFiles.get(lys))); - resprint.put(leu, new PrintStream(resConfFiles.get(leu))); - resprint.put(met, new PrintStream(resConfFiles.get(met))); - resprint.put(asn, new PrintStream(resConfFiles.get(asn))); - resprint.put(pro, new PrintStream(resConfFiles.get(pro))); - resprint.put(gln, new PrintStream(resConfFiles.get(gln))); - resprint.put(arg, new PrintStream(resConfFiles.get(arg))); - resprint.put(ser, new PrintStream(resConfFiles.get(ser))); - resprint.put(thr, new PrintStream(resConfFiles.get(thr))); - resprint.put(val, new PrintStream(resConfFiles.get(val))); - resprint.put(trp, new PrintStream(resConfFiles.get(trp))); - resprint.put(tyr, new PrintStream(resConfFiles.get(tyr))); - resprint.put(sel, new PrintStream(resConfFiles.get(sel))); - Property type = ResourceFactory.createProperty("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "type"); - // add import statements to .conf files for amino acids + // add import statements to <PDB ID>_<Amino Acid>.conf files Iterator<Resource> keys = resprint.keySet().iterator(); - while (keys.hasNext()){ - Resource k = keys.next(); - resprint.get(k).println("import(\"AA_properties.owl\");"); - resprint.get(k).println("import(\"" + rdffile + "\");"); - resprint.get(k).println(); + resprint.get(keys.next()).println(importStmt); } - - /* - * the for-loops beneath may cause trouble, if there exists an amino acid within a structure that - * doesn't exists in our HashMap - */ + + // add every amino acid in positive list to <PDB ID>.conf and its corresponding <PDB ID>_<Amino Acid>.conf + ArrayList<Resource> positives = model.getPositives(); for (int i = 0 ; i < positives.size() ; i++ ) { - out.println("+\"" + positives.get(i).getURI() + "\""); + confFile.println("+\"" + positives.get(i).getURI() + "\""); try{ - Statement spo = model.getProperty(positives.get(i), type); + Statement spo = model.getModel().getProperty(positives.get(i), type); resprint.get(spo.getResource()).println("+\"" + positives.get(i).getURI() + "\""); } catch (NullPointerException e) { // What was the Object that probably caused the pain? - System.err.println("Object probably not in our HashMap: " + - model.getProperty(positives.get(i), type).getResource()); - e.getStackTrace(); + _logger.error("Object probably not in our HashMap: " + + model.getModel().getProperty(positives.get(i), type).getResource()); + e.printStackTrace(); } - // System.out.println("Couldn't find AA: " + positives.get(i).getURI()); - } + // add every amino acid in negative list to <PDB ID>.conf and its corresponding <PDB ID>_<Amino Acid>.conf + ArrayList<Resource> negatives = model.getNegatives(); for (int i = 0 ; i < negatives.size() ; i++ ) { - out.println("-\"" + negatives.get(i).getURI() + "\""); + confFile.println("-\"" + negatives.get(i).getURI() + "\""); try{ - Statement spo = model.getProperty(negatives.get(i), type); + Statement spo = model.getModel().getProperty(negatives.get(i), type); resprint.get(spo.getResource()).println("-\"" + negatives.get(i).getURI() + "\""); } catch (NullPointerException e) { // What was the Object that probably caused the pain? - System.err.println("Object probably not in our HashMap: " + - model.getProperty(negatives.get(i), type).getResource()); - e.getStackTrace(); + _logger.error("Object probably not in our HashMap: " + + model.getModel().getProperty(negatives.get(i), type).getResource()); + e.printStackTrace(); } - - // System.out.println("Couldn't find AA: " + positives.get(i).getURI()); - } // Important - free up resources used running the query - out.close(); + confFile.close(); Iterator<Resource> newkeys = resprint.keySet().iterator(); while ( newkeys.hasNext() ){ @@ -805,143 +393,103 @@ } catch (IOException e) { - System.err.println("OutputStream konnte nicht geschlossen werden!"); + _logger.error("OutputStream konnte nicht geschlossen werden!"); } } - private static void createArffFile(String arffFilePath, PdbRdfModel model, TrainAndTestSet sets, ResIterator riter){ - - String relation = "@RELATION "; - for (int i = 0; i < sets.getTrainset().length ; i++){ - System.out.println("Element " + i + " = " + sets.getTrainset()[i].getPdbID()); - relation += sets.getTrainset()[i]; - } - - /* - * ATTRIBUTES - */ - - // Integer declaring Position in chain - String attrPosInChain = "@ATTRIBUTE position_in_chain NUMERIC\n"; - // Helix = 1 Other = 0 - String attrHelix = "@ATTRIBUTE in_helix NUMERIC\n"; - // Hydrophilic = 0 Hydrophobic = 1 Very_hydrophobic = 2 - String attrHydrophob = "@ATTRIBUTE hydrophob NUMERIC\n"; - // Negative = -1 Neutral = 0 Positive = 1 - String attrCharge = "@ATTRIBUTE charge NUMERIC\n"; - // Large = 2 Small = 1 Tiny = 0.5 - String attrSize = "@ATTRIBUTE size NUMERIC\n"; - // Aliphatic = 0 Aromatic = 1 - String attrAromaticity = "@ATTRIBUTE aromaticity NUMERIC\n"; - // Donor = 1 Donor/Acceptor = 0 Acceptor = -1 - String attrHydrogenbonding = "@ATTRIBUTE hydrogen_bonding NUMERIC\n"; - - String attribute = attrPosInChain + attrHelix + attrHydrophob + - attrCharge + attrSize + attrAromaticity + attrHydrogenbonding + "\n"; - - String data = "@DATA\n"; - - HashMap<Resource, String> resdata = new HashMap<Resource, String>(30); - resdata.put(ala, new String("2,0,0.5,?,?\n")); - resdata.put(cys, new String("1,0,1,?,0\n")); - resdata.put(asp, new String("0,-1,1,?,-1\n")); - resdata.put(glu, new String("0,-1,2,?,-1\n")); - resdata.put(phe, new String("2,0,2,1,?\n")); - resdata.put(gly, new String("2,0,0.5,?,?\n")); - resdata.put(his, new String("1,1,2,1,1\n")); - resdata.put(ile, new String("2,0,2,0,?\n")); - resdata.put(lys, new String("1,1,2,?,1\n")); - resdata.put(leu, new String("2,0,2,0,?\n")); - resdata.put(met, new String("2,0,2,?,?\n")); - resdata.put(asn, new String("0,0,1,?,0\n")); - resdata.put(pro, new String("?,0,1,?,?\n")); - resdata.put(gln, new String("0,0,2,?,0\n")); - resdata.put(arg, new String("0,1,2,?,1\n")); - resdata.put(ser, new String("0,0,0.5,?,0\n")); - resdata.put(thr, new String("1,0,1,?,0,\n")); - resdata.put(val, new String("2,0,1,0,?\n")); - resdata.put(trp, new String("1,0,2,1,1\n")); - resdata.put(tyr, new String("1,0,2,1,0\n")); - resdata.put(sel, new String("?,?,?,?,\n")); - - - // Properties i have to check for while going through the AA-chain - Property iib = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "isImmediatelyBefore"); - Property ba = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "beginsAt"); - Property ea = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "endsAt"); - - Property type = ResourceFactory.createProperty("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "type"); - - // every element in riter stands for a AA-chain start - // every first amino acid indicates a new AA-chain - while (riter.hasNext()) - { - // Initialization of variables needed - int i = 0; - Resource aaOne = riter.nextResource(); - Resource currentaa = aaOne; - Resource nextaa = aaOne; - boolean inHelix = false; - System.out.println(currentaa.getURI()); - // look if there is a next AA - do { - ++i; - System.out.print(i + " "); - //looks weird, but is needed to enter loop even for the last AA which does not have a iib-Property - currentaa = nextaa; - NodeIterator resType = model.listObjectsOfProperty(currentaa,type); + private static void createArffFile(String pdbDir, PDBIdRdfModel model, ResIterator firstAAs){ + + try { + PDBProtein protein = model.getProtein(); + String arffFilePath = pdbDir + protein.getArffFileName(); + PrintStream out = new PrintStream (arffFilePath); + _logger.debug("Creating ARFF file: " + arffFilePath); + + /* + * RELATION + */ + String relation = "@RELATION " + protein.getPdbID(); + out.println(relation); + _logger.debug(relation); + + /* + * ATTRIBUTES + */ + // Integer declaring Position in chain + String attributes = "@ATTRIBUTE hydrophob NUMERIC\n" + // Hydrophilic = 0; Hydrophobic = 1; Very_hydrophobic = 2 + "@ATTRIBUTE charge NUMERIC\n" + // Negative = -1; Neutral = 0; Positive = 1 + "@ATTRIBUTE size NUMERIC\n" + // Large = 2; Small = 1; Tiny = 0.5 + "@ATTRIBUTE aromaticity NUMERIC\n" + // Aliphatic = 0; Aromatic = 1 + "@ATTRIBUTE hydrogen_bonding NUMERIC\n"; // Donor = 1; Donor/Acceptor = 0; Acceptor = -1 + + for (int i = -8; i < 8; i++) { + attributes += "@ATTRIBUTE aa_position_" + i + " CLASS\n"; // amino acid at position $i from current amino acid + } + attributes += "@ATTRIBUTE in_helix NUMERIC\n"; // Helix = 1 Other = 0 + + _logger.debug(attributes); + out.println(attributes); + + /* + * @DATA + */ + String data = "@DATA\n"; + _logger.debug(data); + out.println(data); + + // HashMap containing information about the properties of every amino acid + HashMap<Resource, String> resdata = AminoAcids.getAminoAcidArffAttributeMap(); + ArrayList<Resource> positives = model.getPositives(); + ArrayList<Resource> negatives = model.getNegatives(); + Property type = ResourceFactory.createProperty("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "type"); + Property iib = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "isImmediatelyBefore"); + + + while (firstAAs.hasNext()){ + Resource firstAA = firstAAs.next(); + Resource currentAA = firstAA; + Resource nextAA = firstAA; - // die Guten ins Töpfchen ... - // if we get an non-empty iterator for pdb:beginsAt the next AAs are within a AA-helix - if(model.listResourcesWithProperty(ba, currentaa).hasNext() && !inHelix ) - { - inHelix = true; - } - // die Schlechten ins Kröpfchen - // if we get an non-empty iterator for pdb:endsAt and are already within a AA-helix - // the AAs AFTER the current ones aren't within a helix - if (model.listResourcesWithProperty(ea, currentaa).hasNext() && inHelix) - { - inHelix = false; - } - // get next AA if there is one - if (model.listObjectsOfProperty(currentaa, iib).hasNext()) - { - nextaa = model.getProperty(currentaa, iib).getResource(); - } - - // add current amino acid to positives or negatives set - while(resType.hasNext()) - { - Resource aaType = resType.next().asResource(); - System.out.println(aaType.getURI()); - if (resdata.get(aaType) != null) - { - if (inHelix) - { - data += i + "," + 1 + "," + resdata.get(aaType); + int i = 0; + String dataLine; + do { + dataLine = ""; + currentAA = nextAA; + + NodeIterator niter = model.getModel().listObjectsOfProperty(currentAA, type); + while (niter.hasNext()){ + Resource key = niter.next().asResource(); + if (resdata.containsKey(key)){ + dataLine += resdata.get(key) +","; } - else - { - data += i + "," + 0 + "," + resdata.get(aaType); + } + + for (int j = (i - 8... [truncated message content] |