From: <km...@us...> - 2011-05-12 06:51:52
|
Revision: 2791 http://dl-learner.svn.sourceforge.net/dl-learner/?rev=2791&view=rev Author: kmpf Date: 2011-05-12 06:51:43 +0000 (Thu, 12 May 2011) Log Message: ----------- Modified Paths: -------------- trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/TrainAndTestSet.java Added Paths: ----------- trunk/scripts/src/main/java/org/dllearner/examples/pdb/PdbProtein.java Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java 2011-05-11 12:44:11 UTC (rev 2790) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java 2011-05-12 06:51:43 UTC (rev 2791) @@ -71,9 +71,23 @@ private static ArrayList<Resource> positives; private static ArrayList<Resource> negatives; - + public void setPositives(ArrayList<Resource> pos){ + positives = pos; + } - private static String saveDir = "../test/pdb/"; + public void setNegatives(ArrayList<Resource> neg){ + negatives = neg; + } + + public ArrayList<Resource> getPositives(){ + return positives; + } + + public ArrayList<Resource> getNegatives(){ + return negatives; + } + + private static String dataDir = "../test/pdb/"; private static HashMap<Resource, File> confFilePerResidue; private static File confFileForAll; @@ -82,79 +96,133 @@ * TODO: remove beginsAt, endsAt from model */ public static void main(String[] args) { + Boolean test = true; + int dataSet = 1; + /* + * get dataset files + */ + String bt426 = dataDir + "bt426.list"; + File bt426List = new File(bt426); + String plp273 = dataDir + "plp273.list"; + File plp273List = new File(plp273); + String plp364 = dataDir + "plp364.list"; + File plp364List = new File(plp364); + String plp399 = dataDir + "plp399.list"; + File plp399List = new File(plp399); + + /* + * data for test purpose + */ + String pdbID = "3LQH"; + String chainID = ""; - TrainAndTestSet sets; + /* + * generate trainset and fill trainmodel + */ PdbRdfModel trainmodel = new PdbRdfModel(); - //do{ - String pdbID = "3LQH"; - sets = new TrainAndTestSet(pdbID); - trainmodel.add(getRdfModelForIds(sets.getTrainset())); - /* - * String[] id = {"200L"}; - * trainmodel.add(getRdfModelForIds(id)); - */ + TrainAndTestSet trainSet = new TrainAndTestSet(); + + if (test) + { + trainSet = new TrainAndTestSet(pdbID, chainID); + } + else + { + switch (dataSet) { + case 1: trainSet = new TrainAndTestSet(bt426List); break; + case 2: trainSet = new TrainAndTestSet(plp273List); break; + case 3: trainSet = new TrainAndTestSet(plp364List); break; + case 4: trainSet = new TrainAndTestSet(plp399List); break; + } + } - // PdbRdfModel testmodel = getRdfModelForIds(sets.getTestset()); + + /* + * generate a PdbRdfModel for every pdbID + */ + + for (int i = 0; i < trainSet.getTrainset().length; i++) + { + String[] pdbIDs = {trainSet.getTrainset()[i].getPdbID()}; + trainmodel.removeAll(); + trainmodel.add(getRdfModelForIds(trainSet.getTrainset()[i].getPdbID(), trainSet.getTrainset()[i].getChainID())); + /* - * as we have to handle several amino acid chains we need the first + * as we have sometimes to handle several amino acid chains we need the first * amino acid of every chain, they are returned within a ResIterator */ ResIterator niter = getFirstAA(trainmodel); - ResIterator riter = getFirstAA(trainmodel); + ResIterator riter = niter; /* * we add some distance Information to our model */ trainmodel = addDistanceInfo(trainmodel); - + /* * take all amino acids which are in helices and put them into the * global positives ArrayList, and all others in the global negatives ArrayList */ createPositivesAndNegatives(niter, trainmodel); - //} while(positives.size() > 100 && negatives.size() > 100); - - SimpleDateFormat df = new SimpleDateFormat("_yyyy_MM_dd_HH_mm"); - String date = df.format(new Date()); - String rdffile = "Helixtrainer" + date + ".rdf"; - String arfffile = "Helixtrainer" + date + ".arff"; - String filename = saveDir + rdffile; - - createArffFile(date, arfffile, trainmodel, sets, riter); - /* - * remove all triples that contain information about begin and end of helices - */ - Property ba = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "beginsAt"); - trainmodel = removeStatementsWithPoperty(trainmodel, ba); - Property ea = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "endsAt"); - trainmodel = removeStatementsWithPoperty(trainmodel, ea); - Resource residue = ResourceFactory.createResource("http://bio2rdf.org/pdb:Residue"); - trainmodel = removeStatementsWithObject(trainmodel, residue); - - - try - { + SimpleDateFormat df = new SimpleDateFormat("_yyyy_MM_dd_HH_mm"); + String date = df.format(new Date()); + String rdfFile; + String arffFile; + if (trainSet.getTrainset()[i].getChainID().length() == 0) + { + rdfFile = trainSet.getTrainset()[i].getPdbID().toUpperCase() + date + ".rdf"; + arffFile = trainSet.getTrainset()[i].getPdbID().toUpperCase() + date + ".arff"; + } + else + { + rdfFile = trainSet.getTrainset()[i].getPdbID().toUpperCase() + "." + + trainSet.getTrainset()[i].getChainID().toUpperCase() + date + ".rdf"; + arffFile = trainSet.getTrainset()[i].getPdbID().toUpperCase() + "." + + trainSet.getTrainset()[i].getChainID().toUpperCase() + date + ".arff"; + } + String dir = dataDir + trainSet.getTrainset()[i].getPdbID() + "/"; + File directory = new File(dir); + directory.mkdir(); + String rdfFilePath = dir + rdfFile; + String arffFilePath = dir + arffFile; + + createArffFile(arffFilePath, trainmodel, trainSet, riter); /* - * creatConfFile() - * writes the conf-Files and saves there File-objects in: - * confFileForAll and confFilePerResidue + * remove all triples that contain information about begin and end of helices */ - createConfFile(date, rdffile, trainmodel); - PrintStream out = new PrintStream (new File(filename)); + Property ba = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "beginsAt"); + trainmodel = removeStatementsWithPoperty(trainmodel, ba); + Property ea = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "endsAt"); + trainmodel = removeStatementsWithPoperty(trainmodel, ea); + Resource residue = ResourceFactory.createResource("http://bio2rdf.org/pdb:Residue"); + trainmodel = removeStatementsWithObject(trainmodel, residue); - // Output results - trainmodel.write(out, "RDF/XML"); - // Important - free up resources used running the query - out.close(); - } - catch (IOException e) - { - System.err.println("OutputStream konnte nicht geschlossen werden!"); - } - + try + { + /* + * creatConfFile() + * writes the conf-Files and saves there File-objects in: + * confFileForAll and confFilePerResidue + */ + createConfFile(dir, date, rdfFile, trainmodel); + PrintStream out = new PrintStream (new File(rdfFilePath)); + + // Output results + trainmodel.write(out, "RDF/XML"); + + // Important - free up resources used running the query + out.close(); + } + catch (IOException e) + { + System.err.println("OutputStream konnte nicht geschlossen werden!"); + } + + } + /* * load RDF file and perform learn algorithm for every .conf-file */ @@ -191,34 +259,41 @@ */ } - private static PdbRdfModel getRdfModelForIds(String[] pdbIDs) { + private static PdbRdfModel getRdfModelForIds(String pdbID ,String chainID) { - // i is an Iterator over an XML InputSource + /* + * i is an Iterator over an XML InputSource + */ + String[] pdbIDs = {pdbID}; Pdb2RdfInputIterator i = new PdbsIterator(pdbIDs); PdbXmlParser parser = new PdbXmlParser(); PdbRdfModel allmodels = new PdbRdfModel(); - try { - - while (i.hasNext()) { + while (i.hasNext()) + { final InputSource input = i.next(); PdbRdfModel model = parser.parse(input, new PdbRdfModel()); - // jedes Model muss gleich nach den relevanten Daten durchsucht werden, - // da ansonsten Probleme mit der Speichergröße auftreten können. - allmodels.add(getData(model)); - - } - } catch (IOException e) { + /* + * jedes Model muss gleich nach den relevanten Daten durchsucht werden, + * da ansonsten Probleme mit der Speichergröße auftreten können. + */ + allmodels.add(getData(model, pdbID, chainID)); + } + } + catch (IOException e) + { // TODO Auto-generated catch block e.printStackTrace(); - } catch (SAXException e) { + } + catch (SAXException e) + { // TODO Auto-generated catch block e.printStackTrace(); } return allmodels; } - private static PdbRdfModel getData(PdbRdfModel model) { + private static PdbRdfModel getData(PdbRdfModel model, String pdbID, String chainID) { // Beispiel einer SELECT Abfrage /* String selectQuery = @@ -234,30 +309,70 @@ // CONSTRUCT Abfrage PdbRdfModel construct = new PdbRdfModel(); - /* i do it kind of difficult, but i want to be certain that i only get the sequences of + /* + * i do it kind of difficult, but i want to be certain that i only get the sequences of * Polypeptides(L) which contain at least one Helix. Furthermore i collect the information * about at which position helices begin and end. - * NOTE: this information has to be removed before oututing the model. But i will use this + * NOTE: this information has to be removed before outputing the model. But i will use this * to check for positive and negative train amino acids */ - String queryString = - "PREFIX pdb: <http://bio2rdf.org/pdb:> " + - "CONSTRUCT { ?x1 <http://bio2rdf.org/pdb:beginsAt> ?x2 ." + - " ?x1 <http://bio2rdf.org/pdb:endsAt> ?x3 . " + - " ?x5 <http://purl.org/dc/terms/isPartOf> ?x4 . " + - " ?x5 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?x6 ." + - " ?x5 <http://bio2rdf.org/pdb:isImmediatelyBefore> ?x7 . } " + - "WHERE { ?x1 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://bio2rdf.org/pdb:Helix> ." + - " ?x1 <http://bio2rdf.org/pdb:beginsAt> ?x2 ." + - " ?x1 <http://bio2rdf.org/pdb:endsAt> ?x3 ." + - " ?x3 <http://purl.org/dc/terms/isPartOf> ?x4 ." + - " ?x4 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://bio2rdf.org/pdb:Polypeptide(L)> ." + - " ?x5 <http://purl.org/dc/terms/isPartOf> ?x4 ." + - " ?x5 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?x6 ." + - // with the Optional clause i get the information by which amino acid - // a amino acid is followed - " OPTIONAL { ?x5 <http://bio2rdf.org/pdb:isImmediatelyBefore> ?x7 . } .}"; + /* + * ich brauche noch die selektion der chain und die info über den genursprungsorganismus + * rdf:resource="http://bio2rdf.org/pdb:3LQH/chain_A" + * http://bio2rdf.org/pdb:3LQH/chain_A/position_1596 + */ + + String queryString = ""; + + if (chainID.length() != 1 || pdbID.length() != 4) + { + queryString = + "PREFIX pdb: <http://bio2rdf.org/pdb:> " + + "PREFIX dcterms: <http://purl.org/dc/terms/> " + + "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> " + + "CONSTRUCT { ?x1 <http://bio2rdf.org/pdb:beginsAt> ?x2 ." + + " ?x1 <http://bio2rdf.org/pdb:endsAt> ?x3 . " + + " ?x5 <http://purl.org/dc/terms/isPartOf> ?x4 . " + + " ?x5 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?x6 ." + + " ?x5 <http://bio2rdf.org/pdb:isImmediatelyBefore> ?x7 . } " + + "WHERE { ?x1 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://bio2rdf.org/pdb:Helix> ." + + " ?x1 <http://bio2rdf.org/pdb:beginsAt> ?x2 ." + + " ?x1 <http://bio2rdf.org/pdb:endsAt> ?x3 ." + + " ?x3 <http://purl.org/dc/terms/isPartOf> ?x4 ." + + " ?x4 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://bio2rdf.org/pdb:Polypeptide(L)> ." + + " ?x5 <http://purl.org/dc/terms/isPartOf> ?x4 ." + + " ?x5 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?x6 ." + + // with the Optional clause i get the information by which amino acid + // a amino acid is followed + " OPTIONAL { ?x5 <http://bio2rdf.org/pdb:isImmediatelyBefore> ?x7 . } .}"; + } + else + { + queryString = + "PREFIX pdb: <http://bio2rdf.org/pdb:> " + + "PREFIX dcterms: <http://purl.org/dc/terms/> " + + "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> " + + "CONSTRUCT { ?x1 <http://bio2rdf.org/pdb:beginsAt> ?x2 ." + + " ?x1 <http://bio2rdf.org/pdb:endsAt> ?x3 . " + + " ?x5 <http://purl.org/dc/terms/isPartOf> ?x4 . " + + " ?x5 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?x6 ." + + " ?x5 <http://bio2rdf.org/pdb:isImmediatelyBefore> ?x7 . } " + + "WHERE { ?x1 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://bio2rdf.org/pdb:Helix> ." + + " ?x1 <http://bio2rdf.org/pdb:beginsAt> ?x2 ." + + " ?x1 <http://bio2rdf.org/pdb:endsAt> ?x3 ." + + " ?x3 <http://purl.org/dc/terms/isPartOf> ?x4 ." + + " ?x4 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://bio2rdf.org/pdb:Polypeptide(L)> ." + + " ?x5 <http://purl.org/dc/terms/isPartOf> ?x4 ." + + " ?x5 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?x6 ." + + " ?x5 <http://bio2rdf.org/pdb:hasChainPosition> ?x8 ." + + " ?x8 <http://purl.org/dc/terms/isPartOf> <http://bio2rdf.org/pdb:" + pdbID.toUpperCase() + + "/chain_" + chainID.toUpperCase() + "> ." + + // with the Optional clause i get the information by which amino acid + // a amino acid is followed + " OPTIONAL { ?x5 <http://bio2rdf.org/pdb:isImmediatelyBefore> ?x7 . } .}"; + } + System.out.println(queryString); Query query = QueryFactory.create(queryString); QueryExecution qe = QueryExecutionFactory.create(query, model); construct.add(qe.execConstruct()); @@ -394,11 +509,11 @@ } - private static void createConfFile(String date, String rdffile, PdbRdfModel model){ + private static void createConfFile(String dir, String date, String rdffile, PdbRdfModel model){ try { // the file with all amino acids - String pdbname = saveDir + "pdb" + date + ".conf"; + String pdbname = dir + "pdb" + date + ".conf"; confFileForAll = new File(pdbname); PrintStream out = new PrintStream (confFileForAll); // add import statements @@ -407,27 +522,27 @@ out.println(); HashMap<Resource, File> resConfFiles = new HashMap<Resource, File>(30); - resConfFiles.put(ala, new File(saveDir + ala.getLocalName() + date + ".conf")); - resConfFiles.put(cys, new File(saveDir + cys.getLocalName() + date + ".conf")); - resConfFiles.put(asp, new File(saveDir + asp.getLocalName() + date + ".conf")); - resConfFiles.put(glu, new File(saveDir + glu.getLocalName() + date + ".conf")); - resConfFiles.put(phe, new File(saveDir + phe.getLocalName() + date + ".conf")); - resConfFiles.put(gly, new File(saveDir + gly.getLocalName() + date + ".conf")); - resConfFiles.put(his, new File(saveDir + his.getLocalName() + date + ".conf")); - resConfFiles.put(ile, new File(saveDir + ile.getLocalName() + date + ".conf")); - resConfFiles.put(lys, new File(saveDir + lys.getLocalName() + date + ".conf")); - resConfFiles.put(leu, new File(saveDir + leu.getLocalName() + date + ".conf")); - resConfFiles.put(met, new File(saveDir + met.getLocalName() + date + ".conf")); - resConfFiles.put(asn, new File(saveDir + asn.getLocalName() + date + ".conf")); - resConfFiles.put(pro, new File(saveDir + pro.getLocalName() + date + ".conf")); - resConfFiles.put(gln, new File(saveDir + gln.getLocalName() + date + ".conf")); - resConfFiles.put(arg, new File(saveDir + arg.getLocalName() + date + ".conf")); - resConfFiles.put(ser, new File(saveDir + ser.getLocalName() + date + ".conf")); - resConfFiles.put(thr, new File(saveDir + thr.getLocalName() + date + ".conf")); - resConfFiles.put(val, new File(saveDir + val.getLocalName() + date + ".conf")); - resConfFiles.put(trp, new File(saveDir + trp.getLocalName() + date + ".conf")); - resConfFiles.put(tyr, new File(saveDir + tyr.getLocalName() + date + ".conf")); - resConfFiles.put(sel, new File(saveDir + sel.getLocalName() + date + ".conf")); + resConfFiles.put(ala, new File(dir + ala.getLocalName() + date + ".conf")); + resConfFiles.put(cys, new File(dir + cys.getLocalName() + date + ".conf")); + resConfFiles.put(asp, new File(dir + asp.getLocalName() + date + ".conf")); + resConfFiles.put(glu, new File(dir + glu.getLocalName() + date + ".conf")); + resConfFiles.put(phe, new File(dir + phe.getLocalName() + date + ".conf")); + resConfFiles.put(gly, new File(dir + gly.getLocalName() + date + ".conf")); + resConfFiles.put(his, new File(dir + his.getLocalName() + date + ".conf")); + resConfFiles.put(ile, new File(dir + ile.getLocalName() + date + ".conf")); + resConfFiles.put(lys, new File(dir + lys.getLocalName() + date + ".conf")); + resConfFiles.put(leu, new File(dir + leu.getLocalName() + date + ".conf")); + resConfFiles.put(met, new File(dir + met.getLocalName() + date + ".conf")); + resConfFiles.put(asn, new File(dir + asn.getLocalName() + date + ".conf")); + resConfFiles.put(pro, new File(dir + pro.getLocalName() + date + ".conf")); + resConfFiles.put(gln, new File(dir + gln.getLocalName() + date + ".conf")); + resConfFiles.put(arg, new File(dir + arg.getLocalName() + date + ".conf")); + resConfFiles.put(ser, new File(dir + ser.getLocalName() + date + ".conf")); + resConfFiles.put(thr, new File(dir + thr.getLocalName() + date + ".conf")); + resConfFiles.put(val, new File(dir + val.getLocalName() + date + ".conf")); + resConfFiles.put(trp, new File(dir + trp.getLocalName() + date + ".conf")); + resConfFiles.put(tyr, new File(dir + tyr.getLocalName() + date + ".conf")); + resConfFiles.put(sel, new File(dir + sel.getLocalName() + date + ".conf")); confFilePerResidue = resConfFiles; @@ -518,19 +633,16 @@ } } - private static void createArffFile(String date, String arfffile, PdbRdfModel model, TrainAndTestSet sets, ResIterator riter){ - String arffname = saveDir + "pdb" + date + ".arff"; + private static void createArffFile(String arffFilePath, PdbRdfModel model, TrainAndTestSet sets, ResIterator riter){ - - String relation = "@RELATION "; for (int i = 0; i < sets.getTrainset().length ; i++){ - System.out.println("Element " + i + "= " + sets.getTrainset()[i]); + System.out.println("Element " + i + " = " + sets.getTrainset()[i].getPdbID()); relation += sets.getTrainset()[i]; } /* - * ATTRIBUTEs + * ATTRIBUTES */ // Integer declaring Position in chain @@ -586,7 +698,8 @@ // every element in riter stands for a AA-chain start // every first amino acid indicates a new AA-chain - while (riter.hasNext()) { + while (riter.hasNext()) + { // Initialization of variables needed int i = 0; Resource aaOne = riter.nextResource(); @@ -604,28 +717,36 @@ // die Guten ins Töpfchen ... // if we get an non-empty iterator for pdb:beginsAt the next AAs are within a AA-helix - if(model.listResourcesWithProperty(ba, currentaa).hasNext() && !inHelix ){ + if(model.listResourcesWithProperty(ba, currentaa).hasNext() && !inHelix ) + { inHelix = true; } // die Schlechten ins Kröpfchen // if we get an non-empty iterator for pdb:endsAt and are already within a AA-helix // the AAs AFTER the current ones aren't within a helix - if (model.listResourcesWithProperty(ea, currentaa).hasNext() && inHelix){ + if (model.listResourcesWithProperty(ea, currentaa).hasNext() && inHelix) + { inHelix = false; } // get next AA if there is one - if (model.listObjectsOfProperty(currentaa, iib).hasNext()){ + if (model.listObjectsOfProperty(currentaa, iib).hasNext()) + { nextaa = model.getProperty(currentaa, iib).getResource(); } // add current amino acid to positives or negatives set - while(resType.hasNext()){ + while(resType.hasNext()) + { Resource aaType = resType.next().asResource(); System.out.println(aaType.getURI()); - if (resdata.get(aaType) != null){ - if (inHelix){ + if (resdata.get(aaType) != null) + { + if (inHelix) + { data += i + "," + 1 + "," + resdata.get(aaType); - } else { + } + else + { data += i + "," + 0 + "," + resdata.get(aaType); } } @@ -634,19 +755,18 @@ } while (currentaa.hasProperty(iib)) ; } - try{ - PrintStream out = new PrintStream (new File(arffname)); + try + { + PrintStream out = new PrintStream (new File(arffFilePath)); out.println(relation); out.print(attribute); out.print(data); out.close(); - - } catch (FileNotFoundException e ) { - System.err.println("Datei " + arffname + "konnte nicht angelegt werden!"); + } + catch (FileNotFoundException e ) + { + System.err.println("Datei " + arffFilePath + " konnte nicht angelegt werden!"); e.printStackTrace(); } - - - } -} \ No newline at end of file +} Added: trunk/scripts/src/main/java/org/dllearner/examples/pdb/PdbProtein.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/PdbProtein.java (rev 0) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/PdbProtein.java 2011-05-12 06:51:43 UTC (rev 2791) @@ -0,0 +1,43 @@ +package org.dllearner.examples.pdb; + +public class PdbProtein { + + private String pdbID; + private String chainID; + private String species; + + public PdbProtein(String pdbID) { + this.pdbID = pdbID; + } + + public PdbProtein(String pdbID, String chainID) { + this.pdbID = pdbID; + this.chainID = chainID; + } + public PdbProtein() { + this.pdbID = ""; + this.chainID = ""; + } + + public String getPdbID() { + return pdbID; + } + public void setPdbID(String pdbID) { + this.pdbID = pdbID; + } + public String getChainID() { + return chainID; + } + public void setChainID(String chain) { + this.chainID = chain; + } + public String getSpecies() { + return species; + } + public void setSpecies(String species) { + this.species = species; + } + + + +} Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/TrainAndTestSet.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/TrainAndTestSet.java 2011-05-11 12:44:11 UTC (rev 2790) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/TrainAndTestSet.java 2011-05-12 06:51:43 UTC (rev 2791) @@ -1,5 +1,7 @@ package org.dllearner.examples.pdb; +import java.io.File; +import java.io.FileReader; import java.io.IOException; import java.io.InputStreamReader; import java.io.LineNumberReader; @@ -11,29 +13,40 @@ public class TrainAndTestSet { - private String[] trainset; - private String[] testset; - private HashMap<Integer,String> setentries; - private ArrayList<String> pdbprotlines; + private PdbProtein[] trainset; - public String[] getTrainset() { + public PdbProtein[] getTrainset() { return trainset; } - - public String[] getTestset() { - return testset; + + + public TrainAndTestSet () { + String pdbID = ""; + PdbProtein[] pdbProteins = {new PdbProtein(pdbID)}; + this.trainset = pdbProteins; } public TrainAndTestSet (String pdbID) { - String[] pdbIDs = {pdbID}; - this.trainset = pdbIDs; + PdbProtein[] pdbProteins = {new PdbProtein(pdbID)}; + this.trainset = pdbProteins; } + public TrainAndTestSet (String pdbID, String chainID) { + PdbProtein[] pdbProteins = {new PdbProtein(pdbID, chainID)}; + this.trainset = pdbProteins; + } + public TrainAndTestSet (String[] pdbIDs) { - this.trainset = pdbIDs; + PdbProtein pdbProt; + PdbProtein[] pdbProteins = new PdbProtein[pdbIDs.length]; + for (int i = 0; i < pdbIDs.length; i++ ) + { + pdbProt = new PdbProtein(pdbIDs[i]); + pdbProteins[i] = pdbProt; + } + this.trainset = pdbProteins; } - public TrainAndTestSet (int setsize) { // we read in the online file with all PDB-entries @@ -41,14 +54,12 @@ try { pdbEntryType = new URL("ftp://ftp.wwpdb.org/pub/pdb/derived_data/pdb_entry_type.txt"); LineNumberReader pdbproteins = new LineNumberReader(new InputStreamReader(pdbEntryType.openStream())); - - // we calculate the number of lines in that file and - // read all lines into the global variable pdbprotlines - int linenr = this.getNumberOfLines(pdbproteins); + // read all lines in lines + ArrayList<String> lines = this.readInFile(pdbproteins); pdbproteins.close(); - - - // System.out.println("PDB Prot File has "+linenr+" lines." ); + // get number of lines + int linenr = lines.size(); + System.out.println("PDB Prot File has "+linenr+" lines." ); // handling of incorrect setsize values if ((2*setsize) >= linenr) { @@ -59,9 +70,7 @@ } // lets create Train- and Testset - this.trainset = this.createSet(setsize, linenr); - this.testset = this.createSet(setsize, linenr); - + this.trainset = this.createSet(setsize, linenr, lines); } catch (MalformedURLException e) { // TODO Auto-generated catch block e.printStackTrace(); @@ -71,77 +80,84 @@ } } - // this method counts the number of lines in the read in file and - // fills pdbprotlines with content - private int getNumberOfLines (LineNumberReader lnr) { - try { - int count = 0; - ArrayList<String> arraylist = new ArrayList<String>(); - String line; - - - while ((line = lnr.readLine()) != null) { - arraylist.add(count, line); - count++; + + public TrainAndTestSet (File pdbIDlist) { + try + { + LineNumberReader pdbproteins = new LineNumberReader(new FileReader(pdbIDlist)); + ArrayList<String> lines = this.readInFile(pdbproteins); + pdbproteins.close(); + // get number of lines + int linenr = lines.size(); + PdbProtein[] proteins = new PdbProtein[linenr]; + for (int i = 0; i < linenr; i++) + { + proteins[i].setPdbID(getpdbid(i, lines)); + proteins[i].setChainID(getChainID(i, lines)); } - this.pdbprotlines = arraylist; - - return count; + this.trainset = proteins; } - catch (IOException e) { + catch (IOException e) + { // TODO Auto-generated catch block e.printStackTrace(); - return 0; } + } -/* private void createArrayList(int linenumber){ + private ArrayList<String> readInFile (LineNumberReader lnr) { + ArrayList<String> arraylist = new ArrayList<String>(); try { - ArrayList<String> arraylist = new ArrayList<String>(); - LineNumberReader lnr = new LineNumberReader(new FileReader(this.pdbproteins)); - for (int i = 0; i < linenumber; i++) { - String line = lnr.readLine(); - arraylist.add(i, line); - // System.out.println("Line "+ i +": "+ line); + String line; + while ((line = lnr.readLine()) != null) + { + arraylist.add(line); } - this.pdbprotlines = arraylist; - } catch (FileNotFoundException e) { + } + catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); - } catch (IOException e) { - // TODO Auto-generated catch block - e.printStackTrace(); } - + return arraylist; } - */ //creates Sets of PDB IDs equal to setsize - private String [] createSet(int setsize, int linenr){ - String [] set = new String [setsize]; - if (this.setentries == null) { - this.setentries = new HashMap<Integer,String>(2*setsize); - } - HashMap<Integer,String> setmap = this.setentries; + private PdbProtein[] createSet(int setsize, int linenr, ArrayList<String> lines){ + + PdbProtein[] set = new PdbProtein[setsize]; + HashMap<Integer,String> setmap = new HashMap<Integer,String>(2*setsize); + Random gen = new Random(); for (int i = 0; i < setsize; i++) { int lnr = gen.nextInt(linenr); while (setmap.containsKey(Integer.valueOf(lnr))) { lnr = gen.nextInt(linenr); } - set[i] = this.getpdbid(lnr); - setmap.put(Integer.valueOf(lnr), set[i]); + set[i].setPdbID(this.getpdbid(lnr, lines)); + setmap.put(Integer.valueOf(lnr), set[i].getPdbID()); } - this.setentries = setmap; return set; } - private String getpdbid (int lineNumber) { + private String getpdbid (int lineNumber, ArrayList<String> lines ) { // Initialize a LineNumberReader - ArrayList<String> arraylist = pdbprotlines; - String line =(String) arraylist.get(lineNumber); + String line =(String) lines.get(lineNumber); String pdb_id = line.substring(0, 4); return pdb_id; } + + private String getChainID (int lineNumber, ArrayList<String> lines) { + String line =(String) lines.get(lineNumber); + String chainID; + if (line.length() > 4) + { + chainID = line.substring(5, 7); + } + else + { + chainID = ""; + } + return chainID; + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |