From: Matthias W. <ma...@gm...> - 2010-05-04 10:33:07
|
Dear folks, I was interested in analyzing the canonical SMILES capabilities of the CDK and would like to share my results with you. To do so, I developed a short script that uses the AtomContainerAtomPermutor class to permute the atom order and then computes the SMILES of each resulting permutation. The SMILES of each permutation step is compared to the SMILES of the previous step. As a dataset I used a subset of an older version of the Asinex Gold catalogue that I had lying around here, containing 23075 compounds. The atoms of each AtomContainer were permuted 200 times. The error rate was around 0.13%, quite OK I would say. :) Still, here are some examples where the canonicalisation failed: c1coc(c1)C=NNc2nnn[nH]2 c1coc(c1)C=NNc2[nH]nnn2 Fc2ccccc2(C=NNc1nnn[nH]1) Fc2ccccc2(C=NNc1[nH]nnn1) CCCCCCCCCCc1ncc[nH]1 CCCCCCCCCCc1[nH]ccn1 COc1ccc(cc1)C=NNC(=N)N COc1ccc(cc1)C=NNC(N)=N COc1cccc(C=NNC(=N)N)c1 COc1cccc(C=NNC(N)=N)c1 c1ccc(cc1)C2=NC(N=C(N2)C(Cl)(Cl)Cl)C(Cl)(Cl)Cl c1ccc(cc1)C=2NC(=NC(N=2)C(Cl)(Cl)Cl)C(Cl)(Cl)Cl COc1ccc(cc1)C2=NC(N=C(N2)C(Cl)(Cl)Cl)C(Cl)(Cl)Cl COc1ccc(cc1)C=2NC(=NC(N=2)C(Cl)(Cl)Cl)C(Cl)(Cl)Cl CN(C)c1ccc(cc1)C2=NC(N=C(N2)C(Cl)(Cl)Cl)C(Cl)(Cl)Cl CN(C)c1ccc(cc1)C=2NC(=NC(N=2)C(Cl)(Cl)Cl)C(Cl)(Cl)Cl O=C(O)CSCCc1ccncc1 OC(=O)CSCCc1ccncc1 COc2cc(OC)c(cc2(C=NNc1nnn[nH]1))OC COc2cc(OC)c(cc2(C=NNc1[nH]nnn1))OC c2cc(C=NNc1nnn[nH]1)c3N=S=Nc3(c2) c2cc(C=NNc1[nH]nnn1)c3N=S=Nc3(c2) CCCCCCc1ccc(cc1)CC(=O)O CCCCCCc1ccc(cc1)CC(O)=O CCCCCCCCCc1ccc(cc1)C(=O)O CCCCCCCCCc1ccc(cc1)C(O)=O CCN(CC)CC(=O)O CCN(CC)CC(O)=O CCCCCCC1CCC(CC1)C(=O)O CCCCCCC1CCC(CC1)C(O)=O CCSC1=NCN(CCCC(=O)O)CN1 CCSC=1NCN(CCCC(=O)O)CN=1 O=C(Nc1nnn[nH]1)c2ccco2 O=C(Nc1[nH]nnn1)c2ccco2 O=C(Nc1nnn[nH]1)c2ccc(F)cc2 O=C(Nc1[nH]nnn1)c2ccc(F)cc2 O=C(O)CCCCCCCCCCC1COCCO1 OC(=O)CCCCCCCCCCC1COCCO1 CC(CCCCCCCCCCCCCO)(C(=O)O)C(=O)O CC(CCCCCCCCCCCCCO)(C(O)=O)C(=O)O CC(CCCCCCCCCCCCCCCO)(C(=O)O)C(=O)O CC(CCCCCCCCCCCCCCCO)(C(O)=O)C(=O)O O=C(O)CN1CCCCCC1 OC(=O)CN1CCCCCC1 
O=C(Nc1nnn[nH]1)c2cccc(c2)[N+](=O)[O-] O=C(Nc1[nH]nnn1)c2cccc(c2)[N+](=O)[O-] O=[N+]([O-])c2cc3OCOc3(cc2(C=NNc1nnn[nH]1)) O=[N+]([O-])c2cc3OCOc3(cc2(C=NNc1[nH]nnn1)) CC(C)(CN)C(O)C(=O)O CC(C)(CN)C(O)C(O)=O CCCOc1ccc(CCC(=O)O)cc1 CCCOc1ccc(CCC(O)=O)cc1 Cc2ccc3nc(Cc1nnn[nH]1)[nH]c3(c2) Cc2ccc3nc(Cc1[nH]nnn1)[nH]c3(c2) CCCCSc1nnn[nH]1 CCCCSc1[nH]nnn1 CCCCOc1cccc(c1)C2(CCCC2)C(=O)O CCCCOc1cccc(c1)C2(CCCC2)C(O)=O Mainly tetrazols and carboxylic acids seem to produce errors in some cases. Maybe one could implement an automatic lookup dealing with those ?! I have attached the source code of my test class. Eager to get some feedback. All the best, Matthias +++++++++++++++++++++++
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.openscience.cdk.DefaultChemObjectBuilder;
import org.openscience.cdk.Molecule;
import org.openscience.cdk.exception.InvalidSmilesException;
import org.openscience.cdk.graph.AtomContainerAtomPermutor;
import org.openscience.cdk.interfaces.IAtomContainer;
import org.openscience.cdk.interfaces.IMolecule;
import org.openscience.cdk.io.iterator.DefaultIteratingChemObjectReader;
import org.openscience.cdk.io.iterator.IteratingMDLReader;
import org.openscience.cdk.io.iterator.IteratingSMILESReader;
import org.openscience.cdk.smiles.SmilesGenerator;

/**
 * Checks how stable the SMILES produced by {@link SmilesGenerator} are when the
 * atom order of a molecule is permuted.
 *
 * <p>Every molecule of the input file is run through
 * {@link AtomContainerAtomPermutor}; a SMILES string is generated for each
 * permutation and compared with the one generated for the preceding
 * permutation. A molecule counts as failed as soon as two consecutive SMILES
 * strings differ.
 */
public class CanoTest {

    /** Input type marker: one SMILES string per record. */
    private static final int SMILES = 0;
    /** Input type marker: MDL SD file. */
    private static final int SDF = 1;

    /** When true, progress and summary lines are written to standard output. */
    private boolean debug = true;

    /**
     * Runs the permutation check over every molecule in the given file and
     * prints the summary plus one tab-separated line per failed molecule.
     *
     * @param filename         path of the input file
     * @param type             {@code CanoTest.SDF} or {@code CanoTest.SMILES}
     * @param noOfPermutations maximum number of permutations tried per molecule
     * @throws FileNotFoundException    if the input file cannot be opened
     * @throws InvalidSmilesException   declared by {@link #test(IMolecule, int)}
     * @throws IllegalArgumentException if {@code type} is not a known input type
     */
    public CanoTest(String filename, int type, int noOfPermutations)
            throws FileNotFoundException, InvalidSmilesException {
        log("dataset:\t" + filename);

        // Fail early with a clear message instead of a NullPointerException on
        // the first reader access further down.
        if (type != CanoTest.SDF && type != CanoTest.SMILES) {
            throw new IllegalArgumentException("unknown input type: " + type);
        }

        int totalFailed = 0, totalPassed = 0, molcount = 0;
        List<FailedMolObj> failList = new ArrayList<FailedMolObj>();

        DefaultIteratingChemObjectReader reader = openReader(filename, type);
        try {
            log("");
            while (reader.hasNext()) {
                if (molcount > 0 && molcount % 1000 == 0) {
                    log("numMolsProcessed: " + molcount);
                }
                IMolecule aicont = (IMolecule) reader.next();

                boolean passed = true;
                String[] res = test(aicont, noOfPermutations);
                // Compare each SMILES with its predecessor; the first mismatch
                // is recorded and ends the comparison for this molecule.
                for (int j = 1; j < res.length; j++) {
                    if (!res[j].equals(res[j - 1])) {
                        passed = false;
                        failList.add(new FailedMolObj(molcount + 1, res[j - 1], res[j]));
                        break;
                    }
                }

                if (passed) {
                    totalPassed++;
                } else {
                    totalFailed++;
                }
                molcount++;
            }
        } finally {
            closeReader(reader);
        }

        log("done ...");
        log("+++++++++++++++++++");
        log("");
        log("total Molecules\t" + molcount);
        log("total that passed\t" + totalPassed);
        log("total that failed\t" + totalFailed);
        log("\n\n");

        // The failure list is printed regardless of the debug flag.
        for (FailedMolObj failedMolObj : failList) {
            System.out.println(failedMolObj.getID() + "\t"
                    + failedMolObj.getSmilesBefore() + "\t"
                    + failedMolObj.getSmilesAfter());
        }
    }

    /** Prints one line to standard output when debugging is switched on. */
    private void log(String message) {
        if (debug) {
            System.out.println(message);
        }
    }

    /** Opens the iterating reader that matches the given (already validated) input type. */
    private static DefaultIteratingChemObjectReader openReader(String filename, int type)
            throws FileNotFoundException {
        if (type == CanoTest.SDF) {
            return new IteratingMDLReader(new FileInputStream(filename),
                    DefaultChemObjectBuilder.getInstance());
        }
        return new IteratingSMILESReader(new FileInputStream(filename));
    }

    /** Closes the reader; a failure to close is reported on stderr but does not abort the run. */
    private static void closeReader(DefaultIteratingChemObjectReader reader) {
        try {
            reader.close();
        } catch (IOException e) {
            System.err.println("could not close input: " + e.getMessage());
        }
    }

    /**
     * Generates one SMILES string per atom permutation of the given molecule.
     *
     * @param molecule         molecule whose atoms are permuted
     * @param noOfPermutations upper bound on the number of permutations to try
     * @return the generated SMILES strings in permutation order; the array is
     *         shorter than {@code noOfPermutations} when the permutor runs out
     *         of permutations first, and never contains {@code null} padding
     * @throws InvalidSmilesException declared for callers; kept from the original signature
     */
    public String[] test(IMolecule molecule, int noOfPermutations) throws InvalidSmilesException {
        String[] res = new String[noOfPermutations];
        SmilesGenerator gen = new SmilesGenerator(true);
        IAtomContainer atomContainer = molecule;
        AtomContainerAtomPermutor acap = new AtomContainerAtomPermutor(atomContainer);

        int count = 0;
        while (count < noOfPermutations && acap.hasNext()) {
            IAtomContainer tmp = acap.next();
            res[count] = gen.createSMILES(new Molecule(tmp));
            count++;
        }

        // Small molecules have fewer permutations than requested. Trim the
        // unused tail so callers never see null entries.
        return count == res.length ? res : Arrays.copyOf(res, count);
    }

    /**
     * Command-line entry point.
     *
     * @param args {@code args[0]} is the input file (name ending in "sdf" or
     *             "smi"), {@code args[1]} is the number of permutations per molecule
     */
    public static void main(String[] args) {
        if (args.length != 2) {
            System.out.println("two parameters please - filename and iteration size");
            return;
        }

        String file = args[0];
        int type;
        // endsWith keeps the original "last three characters" rule but does not
        // throw for file names shorter than three characters.
        if (file.endsWith("sdf")) {
            type = CanoTest.SDF;
        } else if (file.endsWith("smi")) {
            type = CanoTest.SMILES;
        } else {
            System.out.println("*.sdf for SD files or *.smi for SMILES");
            return;
        }

        try {
            new CanoTest(file, type, Integer.parseInt(args[1]));
        } catch (NumberFormatException e) {
            System.err.println("iteration size must be an integer: " + args[1]);
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (InvalidSmilesException e) {
            e.printStackTrace();
        }
    }

    /**
     * Record of one molecule whose SMILES changed between two consecutive
     * permutations: its 1-based position in the input and the two differing
     * SMILES strings.
     */
    class FailedMolObj {
        Integer ID;
        String SmilesAfter, SmilesBefore;

        /**
         * @param id   1-based position of the molecule in the input
         * @param mol1 SMILES of the earlier permutation
         * @param mol2 SMILES of the later permutation
         */
        public FailedMolObj(Integer id, String mol1, String mol2) {
            this.setID(id);
            this.setSmilesAfter(mol2);
            this.setSmilesBefore(mol1);
        }

        public Integer getID() {
            return ID;
        }

        public void setID(Integer id) {
            ID = id;
        }

        public String getSmilesAfter() {
            return SmilesAfter;
        }

        public void setSmilesAfter(String smiles) {
            SmilesAfter = smiles;
        }

        public String getSmilesBefore() {
            return SmilesBefore;
        }

        public void setSmilesBefore(String smiles) {
            SmilesBefore = smiles;
        }
    }
}
-- GRATIS für alle GMX-Mitglieder: Die maxdome Movie-FLAT! Jetzt freischalten unter http://portal.gmx.net/de/go/maxdome01 |