From: <km...@us...> - 2011-11-23 18:49:18
|
Revision: 3433 http://dl-learner.svn.sourceforge.net/dl-learner/?rev=3433&view=rev Author: kmpf Date: 2011-11-23 18:49:11 +0000 (Wed, 23 Nov 2011) Log Message: ----------- Amino Acid Properties changed Modified Paths: -------------- trunk/scripts/src/main/java/org/dllearner/examples/pdb/AminoAcids.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBIdRdfModel.java Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/AminoAcids.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/AminoAcids.java 2011-11-23 14:24:16 UTC (rev 3432) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/AminoAcids.java 2011-11-23 18:49:11 UTC (rev 3433) @@ -117,32 +117,122 @@ return resourceString; } - public static HashMap<String, String> getAminoAcidArffAttributeMap(){ +/* + ++++ Amino acid names and numbers ++++ +Every line starts with the one-letter-code, followed by their numeric representation for .arff files, +followed by their three-letter-code and finally their name. + A = 1 Ala Alanin + C = 3 Cys Cystein + D = 4 Asp Aspartat + E = 5 Glu Glutamat + F = 6 Phe Phenylalanin + G = 7 Gly Glycin + H = 8 His Histidin + I = 9 Ile Isoleucin + K = 11 Lys Lysin + L = 12 Leu Leucin + M = 13 Met Methionin + N = 14 Asn Asparagin + O = 15 Pyl Pyrrolysin + P = 16 Pro Prolin + Q = 17 Gln Glutamin + R = 18 Arg Arginin + S = 19 Ser Serin + T = 20 Thr Threonin + U = 21 Sec Selenocystein + V = 22 Val Valin + W = 23 Trp Tryptophan + Y = 25 Tyr Tyrosin +*/ + public static HashMap<String, String> getAminoAcidNumber(){ + HashMap<String,String> resnum = new HashMap<String, String>(30); + resnum.put(new String("A"), new String("1")); + resnum.put(new String("C"), new String("3")); + resnum.put(new String("D"), new String("4")); + resnum.put(new String("E"), new String("5")); + resnum.put(new String("F"), new String("6")); + resnum.put(new String("G"), new String("7")); + resnum.put(new String("H"), new String("8")); + resnum.put(new String("I"), new String("9")); + resnum.put(new String("K"), new String("11")); + resnum.put(new String("L"), new String("12")); + resnum.put(new String("M"), new String("13")); + resnum.put(new String("N"), new String("14")); + resnum.put(new String("O"), new String("15")); + resnum.put(new String("P"), new String("16")); + resnum.put(new String("Q"), new String("17")); + resnum.put(new String("R"), new String("18")); + resnum.put(new String("S"), new String("19")); + resnum.put(new String("T"), new String("20")); + resnum.put(new String("U"), new String("21")); + resnum.put(new String("V"), new String("22")); + resnum.put(new String("W"), new String("23")); + resnum.put(new String("Y"), new String("25")); + + return resnum; + } + + public static HashMap<String, String> getAminoAcidNumericArffAttributeMap(){ + // Hydrophobicity hydrophilic = 0; Hydrophobic = 1; aromatic = 2; aliphatic = 3 + // Polarity unpolar = 0; polar = 1; positive = 2; negative = 3; + // Size Tiny = 0; Small = 1; Large = 2; HashMap<String, String> resdata = new HashMap<String, String>(30); - resdata.put(new String("A"), new String("2,0,0.5,?,?")); - resdata.put(new String("C"), new String("1,0,1,?,0")); - resdata.put(new String("D"), new String("0,-1,1,?,-1")); - resdata.put(new String("E"), new String("0,-1,2,?,-1")); - resdata.put(new String("F"), new String("2,0,2,1,?")); - resdata.put(new String("G"), new String("2,0,0.5,?,?")); - resdata.put(new String("H"), new String("1,1,2,1,1")); - resdata.put(new String("I"), new String("2,0,2,0,?")); - resdata.put(new String("K"), new String("1,1,2,?,1")); - resdata.put(new String("L"), new String("2,0,2,0,?")); - resdata.put(new String("M"), new String("2,0,2,?,?")); - resdata.put(new String("N"), new String("0,0,1,?,0")); - resdata.put(new String("P"), new String("?,0,1,?,?")); - resdata.put(new String("Q"), new String("0,0,2,?,0")); - resdata.put(new String("R"), new String("0,1,2,?,1")); - resdata.put(new String("S"), new String("0,0,0.5,?,0")); - resdata.put(new String("T"), new String("1,0,1,?,0,")); - resdata.put(new String("V"), new String("2,0,1,0,?")); - resdata.put(new String("W"), new String("1,0,2,1,1")); - resdata.put(new String("Y"), new String("1,0,2,1,0")); - resdata.put(new String("U"), new String("?,?,?,?,?")); + resdata.put(new String("A"), new String("1,0,0")); + resdata.put(new String("C"), new String("1,1,0")); + resdata.put(new String("D"), new String("0,3,1")); + resdata.put(new String("E"), new String("0,3,2")); + resdata.put(new String("F"), new String("2,0,2")); + resdata.put(new String("G"), new String("1,0,0")); + resdata.put(new String("H"), new String("2,2,2")); + resdata.put(new String("I"), new String("3,0,2")); + resdata.put(new String("K"), new String("1,2,2")); + resdata.put(new String("L"), new String("3,0,2")); + resdata.put(new String("M"), new String("1,0,2")); + resdata.put(new String("N"), new String("0,1,1")); + resdata.put(new String("O"), new String("?,?,?")); + resdata.put(new String("P"), new String("0,0,1")); + resdata.put(new String("Q"), new String("0,1,2")); + resdata.put(new String("R"), new String("0,2,2")); + resdata.put(new String("S"), new String("0,1,0")); + resdata.put(new String("T"), new String("1,1,1")); + resdata.put(new String("V"), new String("3,0,1")); + resdata.put(new String("W"), new String("2,1,2")); + resdata.put(new String("X"), new String("?,?,?")); // unknown residue (e.g. modified amino acids) + resdata.put(new String("Y"), new String("2,1,2")); + resdata.put(new String("U"), new String("?,?,?")); + return resdata; + } + + public static HashMap<String, String> getAminoAcidNominalArffAttributeMap(){ + // Hydrophobicity hydrophilic = 0; Hydrophobic = 1; aromatic = 2; aliphatic = 3 + // Polarity unpolar = 0 polar = 1; positive = 2; negative = 3; + // Size Tiny = 0; Small = 1; Large = 2; + HashMap<String, String> resdata = new HashMap<String, String>(30); + + resdata.put(new String("A"), new String("Hydrophobic,Unpolar,Tiny")); + resdata.put(new String("C"), new String("Hydrophobic,Polar,Tiny")); + resdata.put(new String("D"), new String("Hydrophilic,Negative,Small")); + resdata.put(new String("E"), new String("Hydrophilic,Negative,Large")); + resdata.put(new String("F"), new String("Aromatic,Unpolar,Large")); + resdata.put(new String("G"), new String("Hydrophobic,Unpolar,Tiny")); + resdata.put(new String("H"), new String("Aromatic,Positive,Large")); + resdata.put(new String("I"), new String("Aliphatic,Unpolar,Large")); + resdata.put(new String("K"), new String("Hydrophobic,Positive,Large")); + resdata.put(new String("L"), new String("Aliphatic,Unpolar,Large")); + resdata.put(new String("M"), new String("Hydrophobic,Unpolar,Large")); + resdata.put(new String("N"), new String("Hydrophilic,Polar,Small")); + resdata.put(new String("O"), new String("?,?,?")); + resdata.put(new String("P"), new String("Hydrophilic,Unpolar,Small")); + resdata.put(new String("Q"), new String("Hydrophilic,Polar,Large")); + resdata.put(new String("R"), new String("Hydrophilic,Positive,Large")); + resdata.put(new String("S"), new String("Hydrophilic,Polar,Tiny")); + resdata.put(new String("T"), new String("Hydrophobic,Polar,Small")); + resdata.put(new String("V"), new String("Aliphatic,Unpolar,Small")); + resdata.put(new String("W"), new String("Aromatic,Polar,Large")); + resdata.put(new String("X"), new String("?,?,?")); // unknown residue (e.g. modified amino acids) + resdata.put(new String("Y"), new String("Aromatic,Polar,Large")); + resdata.put(new String("U"), new String("?,?,?")); return resdata; } - - } Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java 2011-11-23 14:24:16 UTC (rev 3432) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java 2011-11-23 18:49:11 UTC (rev 3433) @@ -20,8 +20,6 @@ import org.dllearner.core.ComponentInitException; import org.dllearner.core.LearningProblemUnsupportedException; - -import com.hp.hpl.jena.rdf.model.NodeIterator; import com.hp.hpl.jena.rdf.model.Property; import com.hp.hpl.jena.rdf.model.ResIterator; import com.hp.hpl.jena.rdf.model.Resource; @@ -176,7 +174,8 @@ if (arff) { ResIterator niter = trainmodel.getFirstAA(); - createArffFile(pdbDir, trainmodel, niter); + createNumericArffFile(pdbDir, trainmodel, niter); + createNominalArffFile(pdbDir, trainmodel, niter); } /* @@ -361,7 +360,7 @@ // add knowledge source definition to <PDB ID>.conf files confFile.println(ks); - + // add knowledge source definition to <PDB ID>_<Amino Acid>.conf files Iterator<Resource> resources = resprint.keySet().iterator(); while (resources.hasNext()){ @@ -451,13 +450,14 @@ } } - private static void createArffFile(String pdbDir, PDBIdRdfModel model, ResIterator firstAAs){ + private static void createNumericArffFile(String pdbDir, PDBIdRdfModel model, ResIterator firstAAs){ try { PDBProtein protein = model.getProtein(); String arffFilePath = pdbDir + protein.getArffFileName(); + arffFilePath = arffFilePath.replace(".arff", ".numeric.arff"); PrintStream out = new PrintStream (arffFilePath); - _logger.debug("Creating ARFF file: " + arffFilePath); + _logger.debug("Creating numeric ARFF file: " + arffFilePath); /* * RELATION @@ -470,14 +470,16 @@ * ATTRIBUTES */ // Integer declaring Position in chain - StringBuffer attributes = new StringBuffer("@ATTRIBUTE hydrophob NUMERIC\n" + // Hydrophilic = 0; Hydrophobic = 1; Very_hydrophobic = 2 - "@ATTRIBUTE charge NUMERIC\n" + // Negative = -1; Neutral = 0; Positive = 1 - "@ATTRIBUTE size NUMERIC\n" + // Large = 2; Small = 1; Tiny = 0.5 - "@ATTRIBUTE aromaticity NUMERIC\n" + // Aliphatic = 0; Aromatic = 1 - "@ATTRIBUTE hydrogen_bonding NUMERIC\n"); // Donor = 1; Donor/Acceptor = 0; Acceptor = -1 + StringBuffer attributes = new StringBuffer( + // Hydrophobicity hydrophilic = 0; Hydrophobic = 1; aromatic = 2; aliphatic = 3 + "@ATTRIBUTE hydrophobicity NUMERIC\n" + + // Polarity unpolar = 0 polar = 1; positive = 2; negative = 3; + "@ATTRIBUTE polarity NUMERIC\n" + + // Size Tiny = 0; Small = 1; Large = 2; + "@ATTRIBUTE size NUMERIC\n"); for (int i = -8; i <= 8; i++) { - attributes.append("@ATTRIBUTE aa_position_" + i + " {A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y}\n"); // amino acid at position $i from current amino acid + attributes.append("@ATTRIBUTE aa_position_" + i + " NUMERIC\n"); // amino acid at position $i from current amino acid } attributes.append("@ATTRIBUTE in_helix NUMERIC\n"); // Helix = 1 Other = 0 @@ -492,14 +494,14 @@ out.println(data); // HashMap containing information about the properties of every amino acid - HashMap<String, String> resdata = AminoAcids.getAminoAcidArffAttributeMap(); + HashMap<String, String> resdata = AminoAcids.getAminoAcidNumericArffAttributeMap(); + HashMap<String, String> resnum = AminoAcids.getAminoAcidNumber(); ArrayList<Resource> positives = model.getPositives(); ArrayList<Resource> negatives = model.getNegatives(); - Property type = ResourceFactory.createProperty("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "type"); - Property iib = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "isImmediatelyBefore"); String sequence = protein.getSequence(); HashMap<Integer, Resource> posRes = model.getPositionResource(); + for ( int i = 0; i < sequence.length(); i++) { StringBuffer dataLine = new StringBuffer(""); String key = Character.toString( sequence.charAt(i) ); @@ -509,13 +511,14 @@ dataLine.append( resdata.get(key) + "," ); } else { // - dataLine.append( resdata.get("U") + "," ); + dataLine.append( resdata.get("X") + "," ); } // add information about neighbouring amino acids to dataLine for (int j = (i - 8); j <= (i + 8) ; j++){ try { - dataLine.append( protein.getSequence().charAt(j) + "," ); + dataLine.append( resnum.get( + Character.toString(protein.getSequence().charAt(j))) + "," ); } catch (IndexOutOfBoundsException e) { dataLine.append( "?," ); } @@ -538,50 +541,96 @@ } catch (FileNotFoundException e){ e.printStackTrace(); } + } + + private static void createNominalArffFile(String pdbDir, PDBIdRdfModel model, ResIterator firstAAs){ + + try { + PDBProtein protein = model.getProtein(); + String arffFilePath = pdbDir + protein.getArffFileName(); + arffFilePath = arffFilePath.replace(".arff", ".nominal.arff"); + PrintStream out = new PrintStream (arffFilePath); + _logger.debug("Creating nominal ARFF file: " + arffFilePath); + + /* + * RELATION + */ + String relation = "@RELATION " + protein.getPdbID(); + out.println(relation); + _logger.debug(relation); + /* + * ATTRIBUTES + */ + // Integer declaring Position in chain + StringBuffer attributes = new StringBuffer( + // Hydrophobicity hydrophilic = 0; hydrophobic = 1; aromatic = 2; aliphatic = 3 + "@ATTRIBUTE hydrophob {hydrophilic, hydrophobic, aromatic, aliphatic}\n" + // Hydrophilic = 0; Hydrophobic = 1; Very_hydrophobic = 2 + // Polarity unpolar = 0; polar = 1; positive = 2; negative = 3; + "@ATTRIBUTE charge {unpolar, polar, positive, negative}\n" + // Negative = -1; Neutral = 0; Positive = 1 + // Size tiny = 0; small = 1; large = 2; + "@ATTRIBUTE size {tiny, small, large}\n"); + + for (int i = -8; i <= 8; i++) { + attributes.append("@ATTRIBUTE aa_position_" + i + " {A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y}\n"); // amino acid at position $i from current amino acid + } + attributes.append("@ATTRIBUTE in_helix {Helix, Non_helix}\n"); // Helix = 1 Other = 0 + + _logger.debug(attributes); + out.println(attributes); -/* // to be exchanged - while (firstAAs.hasNext()){ - Resource firstAA = firstAAs.next(); - Resource currentAA = firstAA; - Resource nextAA = firstAA; + /* + * @DATA + */ + String data = "@DATA\n"; + _logger.debug(data); + out.println(data); + + // HashMap containing information about the properties of every amino acid + HashMap<String, String> resdata = AminoAcids.getAminoAcidNominalArffAttributeMap(); + ArrayList<Resource> positives = model.getPositives(); + ArrayList<Resource> negatives = model.getNegatives(); + String sequence = protein.getSequence(); + HashMap<Integer, Resource> posRes = model.getPositionResource(); + + for ( int i = 0; i < sequence.length(); i++) { + StringBuffer dataLine = new StringBuffer(""); + String key = Character.toString( sequence.charAt(i) ); + + // add amino acid description to dataLine + if ( resdata.containsKey(key) ){ + dataLine.append( resdata.get(key) + "," ); + } else { + // + dataLine.append( resdata.get("X") + "," ); + } + // add information about neighbouring amino acids to dataLine + for (int j = (i - 8); j <= (i + 8) ; j++){ + try { + dataLine.append( protein.getSequence().charAt(j) + "," ); + } catch (IndexOutOfBoundsException e) { + dataLine.append( "?," ); + } + } + // add information about positive or negative to dataLine + if (positives.contains( posRes.get( new Integer(i) ))){ + dataLine.append( "Helix" ); + } else if (negatives.contains( posRes.get( new Integer(i) ))){ + dataLine.append( "Non_helix" ); + } else { + dataLine.append( "?" ); + } - for ( int i = 0; currentAA.hasProperty(iib); i++ ) { - StringBuffer dataLine = new StringBuffer(""); - currentAA = nextAA; - - NodeIterator niter = model.getModel().listObjectsOfProperty(currentAA, type); - while (niter.hasNext()){ - Resource key = niter.next().asResource(); - if (resdata.containsKey(key)){ - dataLine.append( resdata.get(key) + "," ); - } - } - - for (int j = (i - 8); j <= (i + 8) ; j++){ - try { - dataLine.append( protein.getSequence().charAt(j) + "," ); - } catch (IndexOutOfBoundsException e) { - dataLine.append( "?," ); - } - } - - if (positives.contains(currentAA)){ - dataLine.append( "1" ); - } else if (negatives.contains(currentAA)){ - dataLine.append( "0" ); - } else { - dataLine.append( "?" ); - } + _logger.info(dataLine); + out.println(dataLine); + + } + + } catch (FileNotFoundException e){ + e.printStackTrace(); + } + } - - - // get next AA if there is one - if (model.getModel().contains(currentAA, iib)){ - nextAA = model.getModel().getProperty(currentAA, iib).getResource(); - } -*/ - } } Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBIdRdfModel.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBIdRdfModel.java 2011-11-23 14:24:16 UTC (rev 3432) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBIdRdfModel.java 2011-11-23 18:49:11 UTC (rev 3433) @@ -52,7 +52,7 @@ this.getProtein().setSpecies(extractSpecies(_pdbIdModel)); System.out.println("Species: " + this.getProtein().getSpecies()); createPositivesAndNegatives(); - _positionResource = createPositionResidueMap(); + this._positionResource = createPositionResidueMap(); } public PdbRdfModel getModel(){ @@ -160,15 +160,15 @@ Resource polymerSequence = ResourceFactory.createResource("http://bio2rdf.org/pdb:PolymerSequence"); ResIterator riter = model.listResourcesWithProperty(type, polymerSequence); - while (riter.hasNext()){ - Resource nextRes = riter.next(); + while (riter.hasNext()) { + Resource nextRes = riter.nextResource(); if (model.contains(nextRes, hasValue)){ NodeIterator niter = model.listObjectsOfProperty(nextRes, hasValue); sequence = niter.next().toString(); System.out.println("Sequence: " + sequence); } - } + } ; return sequence; } @@ -230,7 +230,6 @@ // a amino acid is followed " OPTIONAL { ?x5 pdb:isImmediatelyBefore ?x7 . } . " + " ?x5 pdb:hasChainPosition ?x8 ." + - " ?x8 rdfs:label ?residuePosition ." + " ?x8 pdb:hasValue ?x9 Filter (xsd:int(?x9)) ."; if (chainID.length() == 1 && pdbID.length() == 4) { @@ -283,7 +282,7 @@ Property iib = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "isImmediatelyBefore"); ResIterator firstAAs = this.getFirstAA(); - while ( firstAAs.hasNext()){ + while ( firstAAs.hasNext()) { Resource firstAA = firstAAs.next(); Resource currentAA = firstAA; posres.put(new Integer(this.getResiduePosition(currentAA)), currentAA); @@ -311,8 +310,7 @@ while ( positionLabelNodes.hasNext() ) { positionLabels.add(positionLabelNodes.next().asLiteral().getInt()); } - - } + } Integer position = null; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |