From: <rv...@us...> - 2012-02-01 21:00:38
|
Revision: 1029 http://treebase.svn.sourceforge.net/treebase/?rev=1029&view=rev Author: rvos Date: 2012-02-01 21:00:31 +0000 (Wed, 01 Feb 2012) Log Message: ----------- Adding state set mappers, which help translate character state sequences from ones that contain mini-syntax for indicating ambiguity as {ACGT} to IUPAC single character ambiguity codes such as N. Added Paths: ----------- trunk/treebase-core/src/main/java/org/cipres/treebase/domain/matrix/StateSetMapper.java trunk/treebase-core/src/main/java/org/cipres/treebase/domain/matrix/StateSetMapperDna.java trunk/treebase-core/src/main/java/org/cipres/treebase/domain/matrix/StateSetMapperProtein.java trunk/treebase-core/src/main/java/org/cipres/treebase/domain/matrix/StateSetMapperRna.java trunk/treebase-core/src/test/java/org/cipres/treebase/domain/matrix/StateSetMapperTest.java Added: trunk/treebase-core/src/main/java/org/cipres/treebase/domain/matrix/StateSetMapper.java =================================================================== --- trunk/treebase-core/src/main/java/org/cipres/treebase/domain/matrix/StateSetMapper.java (rev 0) +++ trunk/treebase-core/src/main/java/org/cipres/treebase/domain/matrix/StateSetMapper.java 2012-02-01 21:00:31 UTC (rev 1029) @@ -0,0 +1,93 @@ +package org.cipres.treebase.domain.matrix; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; + +/** + * This class (and its subclasses) help with mapping between sets + * of fundamental states (such as {ACGT}) and IUPAC single character + * ambiguity symbols. This is especially important for valid NeXML + * generation, which requires that molecular <seq> elements only + * contain single character symbols, not sets. 
+ * @author rvosa + * + */ +public abstract class StateSetMapper { + private Map<Character,HashSet<Character>> symbolToAmbig; + private Map<HashSet<Character>,Character> ambigToSymbol; + + /** + * Returns a map keyed on IUPAC single character ambiguity symbols + * http://droog.gs.washington.edu/parc/images/iupac.html with values + * containing a set (possibly of 1) with fundamental states. + * @return + */ + protected abstract Map<Character,HashSet<Character>> makeSymbolToAmbigMap(); + + + public StateSetMapper () { + symbolToAmbig = makeSymbolToAmbigMap(); + ambigToSymbol = new HashMap<HashSet<Character>,Character>(); + for ( Character key : symbolToAmbig.keySet() ) { + HashSet<Character> value = symbolToAmbig.get(key); + ambigToSymbol.put(value, key); + } + } + + /** + * Instantiates appropriate subclass given datatype + * @param datatype - one of the types in MatrixDataType + * @return + */ + public static StateSetMapper createMapperForDataType(String datatype) { + if ( MatrixDataType.MATRIX_DATATYPE_DNA.equals(datatype) ) { + return new StateSetMapperDna(); + } + else if ( MatrixDataType.MATRIX_DATATYPE_RNA.equals(datatype) ) { + return new StateSetMapperRna(); + } + else if ( MatrixDataType.MATRIX_DATATYPE_PROTEIN.equals(datatype) ) { + return new StateSetMapperProtein(); + } + else { + return null; + } + } + + /** + * Looks up the IUPAC single character symbol for a set of fundamental states + * @return + */ + public Character getSymbolForAmbiguousSet(HashSet<Character> ambiguousSet) { + return ambigToSymbol.get(ambiguousSet); + } + + /** + * Looks up the set of fundamental states for an IUPAC single character symbol + * @return + */ + public HashSet<Character> getAmbiguousSetForSymbol(Character symbol) { + return symbolToAmbig.get(symbol); + } + + /** + * Returns a data type from MatrixDataType.MATRIX_DATATYPE_.* + * @return + */ + public abstract String getDataType(); + + /** + * Helper method for populating the underlying Map object + * @param symbol + * 
@param symbolArray + * @param result + */ + protected void addMapping(Character symbol,Character[] symbolArray,Map<Character, HashSet<Character>> result) { + HashSet<Character> symbolSet = new HashSet<Character>(symbolArray.length); + for ( int i = 0; i < symbolArray.length; i++ ) { + symbolSet.add(symbolArray[i]); + } + result.put(symbol, symbolSet); + } +} Added: trunk/treebase-core/src/main/java/org/cipres/treebase/domain/matrix/StateSetMapperDna.java =================================================================== --- trunk/treebase-core/src/main/java/org/cipres/treebase/domain/matrix/StateSetMapperDna.java (rev 0) +++ trunk/treebase-core/src/main/java/org/cipres/treebase/domain/matrix/StateSetMapperDna.java 2012-02-01 21:00:31 UTC (rev 1029) @@ -0,0 +1,65 @@ +package org.cipres.treebase.domain.matrix; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import org.cipres.treebase.domain.matrix.MatrixDataType; + +public class StateSetMapperDna extends StateSetMapper { + + /* + * (non-Javadoc) + * @see org.cipres.treebase.domain.matrix.StateSetMapper#makeSymbolToAmbigMap() + */ + @Override + protected Map<Character, HashSet<Character>> makeSymbolToAmbigMap() { + return this.makeSymbolToAmbigMapWithSymbols('A', 'C', 'G', 'T'); + } + + /** + * + * @param A + * @param C + * @param G + * @param T + * @return + */ + protected Map<Character, HashSet<Character>> makeSymbolToAmbigMapWithSymbols(char A, char C, char G, char T) { + Map<Character, HashSet<Character>> result = new HashMap<Character, HashSet<Character>>(); + + addMapping(A,new Character[]{A}, result); // A + addMapping(C,new Character[]{C}, result); // C + addMapping(G,new Character[]{G}, result); // G + addMapping(T,new Character[]{T}, result); // T + + addMapping('M',new Character[]{A,C}, result); // M => (A,C) + addMapping('R',new Character[]{A,G}, result); // R => (A,G) + addMapping('W',new Character[]{A,T}, result); // W => (A,T) + addMapping('S',new Character[]{C,G}, 
result); // S => (C,G) + addMapping('Y',new Character[]{C,T}, result); // Y => (C,T) XXX + addMapping('K',new Character[]{G,T}, result); // K => (G,T) + + addMapping('V',new Character[]{A,C,G}, result); // V => (A,C,G) + addMapping('H',new Character[]{A,C,T}, result); // H => (A,C,T) + addMapping('D',new Character[]{A,G,T}, result); // D => (A,G,T) + addMapping('B',new Character[]{C,G,T}, result); // B => (C,G,T) + + addMapping('N',new Character[]{A,C,G,T}, result); // N => (A,C,G,T) + addMapping('?',new Character[]{A,C,G,T,'-'}, result); // ? => (A,C,G,T,-) + addMapping('-',new Character[]{'-'}, result); // - => (-) + + return result; + } + + /** + * Returns MatrixDataType.MATRIX_DATATYPE_DNA + */ + @Override + public String getDataType() { + return MatrixDataType.MATRIX_DATATYPE_DNA; + } + + + + +} Added: trunk/treebase-core/src/main/java/org/cipres/treebase/domain/matrix/StateSetMapperProtein.java =================================================================== --- trunk/treebase-core/src/main/java/org/cipres/treebase/domain/matrix/StateSetMapperProtein.java (rev 0) +++ trunk/treebase-core/src/main/java/org/cipres/treebase/domain/matrix/StateSetMapperProtein.java 2012-02-01 21:00:31 UTC (rev 1029) @@ -0,0 +1,56 @@ +package org.cipres.treebase.domain.matrix; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import org.cipres.treebase.domain.matrix.MatrixDataType; + +public class StateSetMapperProtein extends StateSetMapper { + + /* + * (non-Javadoc) + * @see org.cipres.treebase.domain.matrix.StateSetMapper#makeSymbolToAmbigMap() + */ + @Override + protected Map<Character, HashSet<Character>> makeSymbolToAmbigMap() { + Map<Character, HashSet<Character>> result = new HashMap<Character, HashSet<Character>>(); + + addMapping('A',new Character[]{'A'}, result); // Alanine + addMapping('B',new Character[]{'B'}, result); // Aspartic acid or asparagine + addMapping('C',new Character[]{'C'}, result); // Cysteine + addMapping('D',new
Character[]{'D'}, result); // Aspartic acid + addMapping('E',new Character[]{'E'}, result); // Glutamic acid + addMapping('F',new Character[]{'F'}, result); // Phenylalanine + addMapping('G',new Character[]{'G'}, result); // Glycine + addMapping('H',new Character[]{'H'}, result); // Histidine + addMapping('I',new Character[]{'I'}, result); // Isoleucine + addMapping('K',new Character[]{'K'}, result); // Lysine + addMapping('L',new Character[]{'L'}, result); // Leucine + addMapping('M',new Character[]{'M'}, result); // Methionine + addMapping('N',new Character[]{'N'}, result); // Asparagine + addMapping('P',new Character[]{'P'}, result); // Proline + addMapping('Q',new Character[]{'Q'}, result); // Glutamine + addMapping('R',new Character[]{'R'}, result); // Arginine + addMapping('S',new Character[]{'S'}, result); // Serine + addMapping('T',new Character[]{'T'}, result); // Threonine + addMapping('U',new Character[]{'U'}, result); // Selenocysteine + addMapping('V',new Character[]{'V'}, result); // Valine + addMapping('W',new Character[]{'W'}, result); // Tryptophan + addMapping('Y',new Character[]{'Y'}, result); // Tyrosine + addMapping('Z',new Character[]{'Z'}, result); // Glutamic acid or glutamine + addMapping('X',new Character[]{'A','B','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','U','V','W','Y','Z'}, result); + addMapping('?',new Character[]{'A','B','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','U','V','W','Y','Z','-'}, result); + addMapping('-',new Character[]{'-'}, result); + + return result; + } + + /** + * Returns MatrixDataType.MATRIX_DATATYPE_PROTEIN + */ + @Override + public String getDataType() { + return MatrixDataType.MATRIX_DATATYPE_PROTEIN; + } + +} Added: trunk/treebase-core/src/main/java/org/cipres/treebase/domain/matrix/StateSetMapperRna.java =================================================================== --- trunk/treebase-core/src/main/java/org/cipres/treebase/domain/matrix/StateSetMapperRna.java (rev 0)
+++ trunk/treebase-core/src/main/java/org/cipres/treebase/domain/matrix/StateSetMapperRna.java 2012-02-01 21:00:31 UTC (rev 1029) @@ -0,0 +1,25 @@ +package org.cipres.treebase.domain.matrix; + +import java.util.HashSet; +import java.util.Map; +import org.cipres.treebase.domain.matrix.MatrixDataType; + +public class StateSetMapperRna extends StateSetMapperDna { + + /* + * (non-Javadoc) + * @see org.cipres.treebase.domain.matrix.StateSetMapperDna#makeSymbolToAmbigMap() + */ + @Override + protected Map<Character, HashSet<Character>> makeSymbolToAmbigMap() { + return makeSymbolToAmbigMapWithSymbols('A', 'C', 'G', 'U'); + } + + /** + * Returns MatrixDataType.MATRIX_DATATYPE_RNA + */ + @Override + public String getDataType() { + return MatrixDataType.MATRIX_DATATYPE_RNA; + } +} Added: trunk/treebase-core/src/test/java/org/cipres/treebase/domain/matrix/StateSetMapperTest.java =================================================================== --- trunk/treebase-core/src/test/java/org/cipres/treebase/domain/matrix/StateSetMapperTest.java (rev 0) +++ trunk/treebase-core/src/test/java/org/cipres/treebase/domain/matrix/StateSetMapperTest.java 2012-02-01 21:00:31 UTC (rev 1029) @@ -0,0 +1,54 @@ +package org.cipres.treebase.domain.matrix; + +import junit.framework.Assert; +import junit.framework.TestCase; + +import org.cipres.treebase.domain.matrix.MatrixDataType; +import org.cipres.treebase.domain.matrix.MatrixRow; +import org.cipres.treebase.domain.matrix.StateSetMapper; + +public class StateSetMapperTest extends TestCase { + /** + * The inputSeq pairs each IUPAC symbol with the equivalent {...} set of fundamental + * states (all except Y/{CT}), which are to be mapped onto the IUPAC single character symbols + */ + public void testDnaMapper() { + String inputSeq = "ACGTK{GT}M{AC}R{AG}S{CG}W{AT}B{CGT}D{AGT}H{ACT}V{ACG}N{ACGT}-?{ACGT-}"; + String outputSeq = "ACGTKKMMRRSSWWBBDDHHVVNN-??"; + String result =
MatrixRow.buildNormalizedSymbolString(StateSetMapper.createMapperForDataType(MatrixDataType.MATRIX_DATATYPE_DNA), inputSeq); + Assert.assertEquals(outputSeq, result); + } + + /** + * The inputSeq pairs each IUPAC symbol with the equivalent {...} set of fundamental + * states (all except Y/{CU}), which are to be mapped onto the IUPAC single character symbols + */ + public void testRnaMapper() { + String inputSeq = "ACGUK{GU}M{AC}R{AG}S{CG}W{AU}B{CGU}D{AGU}H{ACU}V{ACG}N{ACGU}-?{ACGU-}"; + String outputSeq = "ACGUKKMMRRSSWWBBDDHHVVNN-??"; + String result = MatrixRow.buildNormalizedSymbolString(StateSetMapper.createMapperForDataType(MatrixDataType.MATRIX_DATATYPE_RNA), inputSeq); + Assert.assertEquals(outputSeq, result); + } + + /** + * The inputSeq contains every single amino acid symbol, the full state set (X), the full + * state set plus gap (?) and a gap, which are to be mapped onto the IUPAC single character symbols + */ + public void testProteinMapper() { + String inputSeq = "ABCDEFGHIKLMNPQRSTUVWYZ{ABCDEFGHIKLMNPQRSTUVWYZ}{ABCDEFGHIKLMNPQRSTUVWYZ-}-"; + String outputSeq = "ABCDEFGHIKLMNPQRSTUVWYZX?-"; + String result = MatrixRow.buildNormalizedSymbolString(StateSetMapper.createMapperForDataType(MatrixDataType.MATRIX_DATATYPE_PROTEIN), inputSeq); + Assert.assertEquals(outputSeq, result); + } + + /** + * For STANDARD data there is no implied mapping. For now we will simply + * pass these through, though that's technically not valid. + */ + public void testStandardMapper() { + String inputSeq = "012{1}"; + String outputSeq = "012{1}"; + String result = MatrixRow.buildNormalizedSymbolString(StateSetMapper.createMapperForDataType(MatrixDataType.MATRIX_DATATYPE_STANDARD), inputSeq); + Assert.assertEquals(outputSeq, result); + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |