From: <km...@us...> - 2011-11-11 15:05:35
|
Revision: 3396 http://dl-learner.svn.sourceforge.net/dl-learner/?rev=3396&view=rev Author: kmpf Date: 2011-11-11 15:05:29 +0000 (Fri, 11 Nov 2011) Log Message: ----------- Untested workaround to get the positive/negative class mapping correct. Modified Paths: -------------- trunk/scripts/src/main/java/org/dllearner/examples/pdb/AminoAcids.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBIdRdfModel.java Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/AminoAcids.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/AminoAcids.java 2011-11-11 09:22:08 UTC (rev 3395) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/AminoAcids.java 2011-11-11 15:05:29 UTC (rev 3396) @@ -117,29 +117,29 @@ return resourceString; } - public static HashMap<Resource, String> getAminoAcidArffAttributeMap(){ - HashMap<Resource, String> resdata = new HashMap<Resource, String>(30); - resdata.put(ALA, new String("2,0,0.5,?,?")); - resdata.put(CYS, new String("1,0,1,?,0")); - resdata.put(ASP, new String("0,-1,1,?,-1")); - resdata.put(GLU, new String("0,-1,2,?,-1")); - resdata.put(PHE, new String("2,0,2,1,?")); - resdata.put(GLY, new String("2,0,0.5,?,?")); - resdata.put(HIS, new String("1,1,2,1,1")); - resdata.put(ILE, new String("2,0,2,0,?")); - resdata.put(LYS, new String("1,1,2,?,1")); - resdata.put(LEU, new String("2,0,2,0,?")); - resdata.put(MET, new String("2,0,2,?,?")); - resdata.put(ASN, new String("0,0,1,?,0")); - resdata.put(PRO, new String("?,0,1,?,?")); - resdata.put(GLN, new String("0,0,2,?,0")); - resdata.put(ARG, new String("0,1,2,?,1")); - resdata.put(SER, new String("0,0,0.5,?,0")); - resdata.put(THR, new String("1,0,1,?,0,")); - resdata.put(VAL, new String("2,0,1,0,?")); - resdata.put(TRP, new String("1,0,2,1,1")); - resdata.put(TYR, new String("1,0,2,1,0")); - resdata.put(SEL, new String("?,?,?,?,?")); + public static HashMap<String, String> getAminoAcidArffAttributeMap(){ + HashMap<String, String> resdata = new HashMap<String, String>(30); + resdata.put(new String("A"), new String("2,0,0.5,?,?")); + resdata.put(new String("C"), new String("1,0,1,?,0")); + resdata.put(new String("D"), new String("0,-1,1,?,-1")); + resdata.put(new String("E"), new String("0,-1,2,?,-1")); + resdata.put(new String("F"), new String("2,0,2,1,?")); + resdata.put(new String("G"), new String("2,0,0.5,?,?")); + resdata.put(new String("H"), new String("1,1,2,1,1")); + resdata.put(new String("I"), new String("2,0,2,0,?")); + resdata.put(new String("K"), new String("1,1,2,?,1")); + resdata.put(new String("L"), new String("2,0,2,0,?")); + resdata.put(new String("M"), new String("2,0,2,?,?")); + resdata.put(new String("N"), new String("0,0,1,?,0")); + resdata.put(new String("P"), new String("?,0,1,?,?")); + resdata.put(new String("Q"), new String("0,0,2,?,0")); + resdata.put(new String("R"), new String("0,1,2,?,1")); + resdata.put(new String("S"), new String("0,0,0.5,?,0")); + resdata.put(new String("T"), new String("1,0,1,?,0,")); + resdata.put(new String("V"), new String("2,0,1,0,?")); + resdata.put(new String("W"), new String("1,0,2,1,1")); + resdata.put(new String("Y"), new String("1,0,2,1,0")); + resdata.put(new String("U"), new String("?,?,?,?,?")); return resdata; } Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java 2011-11-11 09:22:08 UTC (rev 3395) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/HelixRDFCreator.java 2011-11-11 15:05:29 UTC (rev 3396) @@ -156,6 +156,7 @@ _logger.info("PDB ID: " + protein.getPdbID()); _logger.info("chain ID: " + protein.getChainID()); + trainmodel = new PDBIdRdfModel(protein); if (fasta){ @@ -491,13 +492,55 @@ out.println(data); // HashMap containing information about the properties of every amino acid - HashMap<Resource, String> resdata = AminoAcids.getAminoAcidArffAttributeMap(); + HashMap<String, String> resdata = AminoAcids.getAminoAcidArffAttributeMap(); ArrayList<Resource> positives = model.getPositives(); ArrayList<Resource> negatives = model.getNegatives(); Property type = ResourceFactory.createProperty("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "type"); Property iib = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "isImmediatelyBefore"); - + String sequence = protein.getSequence(); + HashMap<Integer, Resource> posRes = model.getPositionResource(); + for ( int i = 0; i < sequence.length(); i++) { + StringBuffer dataLine = new StringBuffer(""); + String key = Character.toString( sequence.charAt(i) ); + + // add amino acid description to dataLine + if ( resdata.containsKey(key) ){ + dataLine.append( resdata.get(key) + "," ); + } else { + // + dataLine.append( resdata.get("U") + "," ); + } + + // add information about neighbouring amino acids to dataLine + for (int j = (i - 8); j <= (i + 8) ; j++){ + try { + dataLine.append( protein.getSequence().charAt(j) + "," ); + } catch (IndexOutOfBoundsException e) { + dataLine.append( "?," ); + } + } + + // add information about positive or negative to dataLine + if (positives.contains( posRes.get( new Integer(i) ))){ + dataLine.append( "1" ); + } else if (negatives.contains( posRes.get( new Integer(i) ))){ + dataLine.append( "0" ); + } else { + dataLine.append( "?" ); + } + + _logger.info(dataLine); + out.println(dataLine); + + } + + } catch (FileNotFoundException e){ + e.printStackTrace(); + } + + +/* // to be exchanged while (firstAAs.hasNext()){ Resource firstAA = firstAAs.next(); Resource currentAA = firstAA; @@ -529,6 +572,8 @@ dataLine.append( "1" ); } else if (negatives.contains(currentAA)){ dataLine.append( "0" ); + } else { + dataLine.append( "?" ); } @@ -537,12 +582,6 @@ if (model.getModel().contains(currentAA, iib)){ nextAA = model.getModel().getProperty(currentAA, iib).getResource(); } - _logger.info(dataLine); - out.println(dataLine); - } - } - } catch (FileNotFoundException e){ - e.printStackTrace(); - } +*/ } } Modified: trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBIdRdfModel.java =================================================================== --- trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBIdRdfModel.java 2011-11-11 09:22:08 UTC (rev 3395) +++ trunk/scripts/src/main/java/org/dllearner/examples/pdb/PDBIdRdfModel.java 2011-11-11 15:05:29 UTC (rev 3396) @@ -5,7 +5,13 @@ import java.io.IOException; import java.io.PrintStream; import java.util.ArrayList; +import java.util.HashMap; +import org.apache.log4j.ConsoleAppender; +import org.apache.log4j.FileAppender; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import org.apache.log4j.SimpleLayout; import org.xml.sax.InputSource; import com.dumontierlab.pdb2rdf.model.PdbRdfModel; @@ -27,13 +33,17 @@ public class PDBIdRdfModel { + private static Logger _logger = Logger.getRootLogger(); + private PdbRdfModel _pdbIdModel = new PdbRdfModel(); private PdbRdfModel _removedFromModel = new PdbRdfModel(); private PDBProtein _protein = null ; private ArrayList<Resource> _positives = null; private ArrayList<Resource> _negatives = null; + private HashMap<Integer, Resource> _positionResource = null; - public PDBIdRdfModel (PDBProtein protein){ + public PDBIdRdfModel (PDBProtein protein) { + this._protein = protein; this._pdbIdModel = this.getPdbRdfModel(); this.getProtein().setSequence(extractSequence(_pdbIdModel)); @@ -41,6 +51,7 @@ this.getProtein().setSpecies(extractSpecies(_pdbIdModel)); System.out.println("Species: " + this.getProtein().getSpecies()); createPositivesAndNegatives(); + _positionResource = createPositionResidueMap(); } public PdbRdfModel getModel(){ @@ -58,6 +69,10 @@ public ArrayList<Resource> getNegatives(){ return _negatives; } + + public HashMap<Integer, Resource> getPositionResource(){ + return _positionResource; + } private PdbRdfModel getPdbRdfModel() { String[] pdbIDs = {_protein.getPdbID()}; @@ -192,13 +207,15 @@ "PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> " + "PREFIX fn: <http://www.w3.org/2005/xpath-functions#> " + "CONSTRUCT { ?x1 pdb:beginsAt ?x2 ." + - " ?x1 pdb:endsAt ?x3 . " + - " ?x5 dcterms:isPartOf ?x4 . " + + " ?x1 pdb:endsAt ?x3 ." + + " ?x5 dcterms:isPartOf ?x4 ." + " ?x5 rdf:type ?x6 ." + " ?x5 pdb:isImmediatelyBefore ?x7 ." + + " ?x5 pdb:hasChainPosition ?x8 ." + + " ?x8 rdfs:label ?residuePosition ." + " ?organism rdfs:label ?organismName ." + " ?seq rdf:type pdb:PolymerSequence ." + - " ?seq pdb:hasValue ?sequence. } " + + " ?seq pdb:hasValue ?sequence . } " + "WHERE { ?x1 rdf:type pdb:Helix ." + " ?x1 pdb:beginsAt ?x2 ." + " ?x1 pdb:endsAt ?x3 ." + @@ -206,7 +223,9 @@ " ?x4 rdf:type <http://bio2rdf.org/pdb:Polypeptide(L)> ." + " ?x5 dcterms:isPartOf ?x4 ." + " ?x5 rdf:type ?x6 ." + - " ?x5 pdb:hasChainPosition ?x8 . " + + " ?x5 pdb:hasChainPosition ?x8 ." + + " ?x8 dcterms:isPartOf ?x4 ." + + " ?x8 rdfs:label ?residuePosition ." + " OPTIONAL { ?x5 pdb:isImmediatelyBefore ?x7 . } . "; if (chainID.length() == 1 && pdbID.length() == 4) @@ -256,6 +275,59 @@ return niter; } + private HashMap<Integer, Resource> createPositionResidueMap(){ + + HashMap<Integer, Resource> posres = new HashMap<Integer, Resource>(150); + Property iib = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "isImmediatelyBefore"); + + ResIterator firstAAs = this.getFirstAA(); + while ( firstAAs.hasNext()){ + Resource firstAA = firstAAs.next(); + Resource nextAA = firstAA; + Resource currentAA = firstAA; + do { + currentAA = nextAA; + posres.put(new Integer(this.getResiduePosition(currentAA)), currentAA); + nextAA = _pdbIdModel.getProperty(currentAA, iib).getResource(); + } while (currentAA.hasProperty(iib)); + } + + return posres; + } + + private int getResiduePosition(Resource res) { + Property hasChainPosition = ResourceFactory.createProperty("http://bio2rdf.org/pdb:", "hasChainPosition"); + Property label = ResourceFactory.createProperty("http://www.w3.org/2000/01/rdf-schema#", "label"); + ResourceFactory.createResource(); + + NodeIterator residuePosition = _pdbIdModel.listObjectsOfProperty(res, hasChainPosition ); + ArrayList<RDFNode> positionNodes = new ArrayList<RDFNode>(); + ArrayList<String> positionLabels = new ArrayList<String>(); + while ( residuePosition.hasNext() ) { + RDFNode positionNode = residuePosition.next(); + positionNodes.add(positionNode); + NodeIterator positionLabelNodes = _pdbIdModel.listObjectsOfProperty( positionNode.asResource(), label ); + while ( positionLabelNodes.hasNext() ) { + positionLabels.add(positionLabelNodes.next().toString()); + } + + } + + + Integer position = null; + if ( positionNodes.size() == 1 && positionLabels.size() == 1 ) { + String positionLabel = positionLabels.get(0); + String a = new String( "Position " ); + String b = new String( " on chain" ); + position = Integer.parseInt( + positionLabel.substring(positionLabel.indexOf(a) + a.length(), positionLabel.indexOf(b))); + } else { + position = new Integer(0); + _logger.error(""); + } + return position.intValue(); + } + public void addDistanceInfo(){ String queryString = "PREFIX pdb: <http://bio2rdf.org/pdb:> " + This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |