From: <hen...@us...> - 2010-06-20 09:37:41
|
Revision: 5042 http://sashimi.svn.sourceforge.net/sashimi/?rev=5042&view=rev Author: henrylam Date: 2010-06-20 09:37:35 +0000 (Sun, 20 Jun 2010) Log Message: ----------- [SpectraST] Modernize library statistics reporting in pepidx: higher charges, more modifications, etc Modified Paths: -------------- trunk/trans_proteomic_pipeline/src/Search/SpectraST/Peptide.cpp trunk/trans_proteomic_pipeline/src/Search/SpectraST/Peptide.hpp trunk/trans_proteomic_pipeline/src/Search/SpectraST/SpectraSTLib.cpp trunk/trans_proteomic_pipeline/src/Search/SpectraST/SpectraSTPeptideLibIndex.cpp trunk/trans_proteomic_pipeline/src/Search/SpectraST/SpectraSTPeptideLibIndex.hpp Modified: trunk/trans_proteomic_pipeline/src/Search/SpectraST/Peptide.cpp =================================================================== --- trunk/trans_proteomic_pipeline/src/Search/SpectraST/Peptide.cpp 2010-06-19 16:18:57 UTC (rev 5041) +++ trunk/trans_proteomic_pipeline/src/Search/SpectraST/Peptide.cpp 2010-06-20 09:37:35 UTC (rev 5042) @@ -993,6 +993,41 @@ return ((unsigned int)(stripped.length())); } +// getAllPresentModTypes - lists all distinct (aa,modType) present on this peptide (and the number of times each occurs) +bool Peptide::getAllPresentModTypes(map<string, unsigned int>& presentModTypes) { + + if (!isModsSet || (mods.empty() && nTermMod.empty() && cTermMod.empty())) { + return (false); + } + + presentModTypes.clear(); + + if (!nTermMod.empty()) { + string tag("n,"); + tag += nTermMod; + presentModTypes[tag] = 1; + } + + if (!cTermMod.empty()) { + string tag("c,"); + tag += cTermMod; + presentModTypes[tag] = 1; + } + + for (map<int, string>::iterator i = mods.begin(); i != mods.end(); i++) { + stringstream tag; + tag << stripped[i->first] << ',' << i->second; + if (presentModTypes.find(tag.str()) != presentModTypes.end()) { + presentModTypes[tag.str()]++; + } else { + presentModTypes[tag.str()] = 1; + } + } + + return (true); + +} + // ==================================================================== // METHODS TO MANAGE AND ACCESS THE MASS TABLES Modified: trunk/trans_proteomic_pipeline/src/Search/SpectraST/Peptide.hpp =================================================================== --- trunk/trans_proteomic_pipeline/src/Search/SpectraST/Peptide.hpp 2010-06-19 16:18:57 UTC (rev 5041) +++ trunk/trans_proteomic_pipeline/src/Search/SpectraST/Peptide.hpp 2010-06-20 09:37:35 UTC (rev 5042) @@ -131,6 +131,7 @@ bool isUncleavableICAT(); bool isCAMCysteine(); bool hasUnmodifiedCysteine(); + bool getAllPresentModTypes(map<string, unsigned int>& presentModTypes); // method to introduce modifications by means of a MSP-style mod string bool parseMspModStr(string mspModStr, bool add = false); Modified: trunk/trans_proteomic_pipeline/src/Search/SpectraST/SpectraSTLib.cpp =================================================================== --- trunk/trans_proteomic_pipeline/src/Search/SpectraST/SpectraSTLib.cpp 2010-06-19 16:18:57 UTC (rev 5041) +++ trunk/trans_proteomic_pipeline/src/Search/SpectraST/SpectraSTLib.cpp 2010-06-20 09:37:35 UTC (rev 5042) @@ -269,7 +269,7 @@ cout << endl; - m_pepIndex->printStats(); + m_pepIndex->printStats(cout); cout << endl; } Modified: trunk/trans_proteomic_pipeline/src/Search/SpectraST/SpectraSTPeptideLibIndex.cpp =================================================================== --- trunk/trans_proteomic_pipeline/src/Search/SpectraST/SpectraSTPeptideLibIndex.cpp 2010-06-19 16:18:57 UTC (rev 5041) +++ trunk/trans_proteomic_pipeline/src/Search/SpectraST/SpectraSTPeptideLibIndex.cpp 2010-06-20 09:37:35 UTC (rev 5042) @@ -54,20 +54,17 @@ SpectraSTPeptideLibIndex::SpectraSTPeptideLibIndex(string idxFileName) : SpectraSTLibIndex(idxFileName, "Peptide"), m_map(), - m_strippedPeptideCount(0), + m_peptideSequenceCount(0), m_peptideIonCount(0), - m_spectrumCount(0), - m_charge1Count(0), - m_charge2Count(0), - m_charge3Count(0), - m_icatClCount(0), - m_icatUcCount(0), + m_peptideSpectrumCount(0), + m_chargeCount(6, 0), m_trypticCount(0), m_semitrypticCount(0), m_nontrypticCount(0), - m_camCount(0), - m_metOxCount(0), + m_modsCount(), m_nonPeptideCount(0), + m_nonPeptideIonCount(0), + m_nonPeptideSpectrumCount(0), m_isUnique(true), m_prob9999(0), m_prob999(0), @@ -88,20 +85,18 @@ SpectraSTPeptideLibIndex::SpectraSTPeptideLibIndex(string idxFileName, ifstream* libFinPtr, bool binaryLib) : SpectraSTLibIndex(idxFileName, libFinPtr, "Peptide", binaryLib), m_map(), - m_strippedPeptideCount(0), + m_peptideSequenceCount(0), m_peptideIonCount(0), - m_spectrumCount(0), - m_charge1Count(0), - m_charge2Count(0), - m_charge3Count(0), - m_icatClCount(0), - m_icatUcCount(0), + m_peptideSpectrumCount(0), + m_chargeCount(6, 0), m_trypticCount(0), m_semitrypticCount(0), - m_camCount(0), - m_metOxCount(0), + m_nontrypticCount(0), + m_modsCount(), m_nonPeptideCount(0), - m_isUnique(true), + m_nonPeptideIonCount(0), + m_nonPeptideSpectrumCount(0), + m_isUnique(true), m_prob9999(0), m_prob999(0), m_prob99(0), @@ -128,11 +123,41 @@ // insertEntry - used to insert an entry to the index void SpectraSTPeptideLibIndex::insertEntry(SpectraSTLibEntry* entry, fstream::off_type offset) { + double prob = entry->getProb(); + + if (prob > 0.9999) { + m_prob9999++; + } else if (prob > 0.999) { + m_prob999++; + } else if (prob > 0.99) { + m_prob99++; + } else if (prob > 0.9) { + m_prob9++; + } else { + m_prob0++; + } + + unsigned int nreps = entry->getNrepsUsed(); + + if (nreps >= 20) { + m_nreps20++; + } else if (nreps >= 10) { + m_nreps10++; + } else if (nreps >= 4) { + m_nreps4++; + } else if (nreps >= 2) { + m_nreps2++; + } else { + m_nreps1++; + } + + m_entryCount++; + Peptide* pep = entry->getPeptidePtr(); if (!pep) { - m_spectrumCount++; - m_nonPeptideCount++; + m_nonPeptideSpectrumCount++; + string::size_type slashPos = 0; int charge = entry->getCharge(); string name = entry->getName(); @@ -145,11 +170,31 @@ if (!(frag.empty())) { subkeyss << frag; } + + if (m_map.find(name) == m_map.end()) { + m_nonPeptideCount++; + m_nonPeptideIonCount++; + } else if ((m_map[name]).find(subkeyss.str()) == (m_map[name]).end()) { + m_nonPeptideIonCount++; + } else { + m_isUnique = false; + } + + if (charge > 5) { + m_chargeCount[5]++; + } else if (charge > 1) { + m_chargeCount[charge - 1]++; + } + ((m_map[name])[subkeyss.str()]).push_back(offset); return; } - + + // peptides + + m_peptideSpectrumCount++; + string mods = pep->mspMods(); // create key for sub-map. The sub-key is of the format <charge>|<mod string in .msp style> @@ -161,32 +206,36 @@ subkey << frag; } - // update counts - m_entryCount++; - m_spectrumCount++; map<string, map<string, vector<fstream::off_type> > >::iterator found = m_map.find(pep->stripped); if (found == m_map.end()) { // new stripped peptide - m_strippedPeptideCount++; - } - if (((m_map[pep->stripped])[subkey.str()]).size() == 0) { + m_peptideSequenceCount++; + m_peptideIonCount++; + } else if ((m_map[pep->stripped]).find(subkey.str()) == (m_map[pep->stripped]).end()) { // new peptide ion m_peptideIonCount++; } else { m_isUnique = false; } - if (pep->isCleavableICAT()) { - m_icatClCount++; - } - if (pep->isUncleavableICAT()) { - m_icatUcCount++; - } - if (pep->isCAMCysteine()) { - m_camCount++; - } - if (mods.find("M,Oxidation", 0) != string::npos) { - m_metOxCount++; + + if (pep->charge > 5) { + m_chargeCount[5]++; + } else if (pep->charge >= 1) { + m_chargeCount[pep->charge - 1]++; } + + map<string, unsigned int> presentModTypes; + if (pep->getAllPresentModTypes(presentModTypes)) { + for (map<string, unsigned int>::iterator m = presentModTypes.begin(); m != presentModTypes.end(); m++) { + string tag = m->first; + if (m_modsCount.find(tag) != m_modsCount.end()) { + m_modsCount[tag]++; + } else { + m_modsCount[tag] = 1; + } + } + } + if (pep->NTT() == 2) { m_trypticCount++; } else if (pep->NTT() == 1) { @@ -195,42 +244,6 @@ m_nontrypticCount++; } - if (pep->charge == 1) { - m_charge1Count++; - } else if (pep->charge == 2) { - m_charge2Count++; - } else if (pep->charge == 3) { - m_charge3Count++; - } - - double prob = entry->getProb(); - - if (prob > 0.9999) { - m_prob9999++; - } else if (prob > 0.999) { - m_prob999++; - } else if (prob > 0.99) { - m_prob99++; - } else if (prob > 0.9) { - m_prob9++; - } else { - m_prob0++; - } - - unsigned int nreps = entry->getNrepsUsed(); - - if (nreps >= 20) { - m_nreps20++; - } else if (nreps >= 10) { - m_nreps10++; - } else if (nreps >= 4) { - m_nreps4++; - } else if (nreps >= 2) { - m_nreps2++; - } else { - m_nreps1++; - } - // insert ((m_map[pep->stripped])[subkey.str()]).push_back(offset); @@ -250,21 +263,9 @@ idxFout << "### " << m_idxFileName << endl; idxFout << "### " << endl; - idxFout << "### Total number of spectra in library: " << m_spectrumCount << endl; - idxFout << "### Total number of distinct peptide ions in library: " << m_peptideIonCount << endl; - idxFout << "### Total number of distinct stripped peptides in library: " << m_strippedPeptideCount << endl; - - if (m_nonPeptideCount > 0) { - idxFout << "### Total number of non-peptide entries in library: " << m_nonPeptideCount << endl; - } - - idxFout << "### " << endl; - idxFout << "### CHARGE +1: " << m_charge1Count << " ; +2: " << m_charge2Count << " ; +3: " << m_charge3Count << endl; - idxFout << "### TERMINI Tryptic: " << m_trypticCount << " ; Semi-tryptic: " << m_semitrypticCount << " ; Non-tryptic: " << m_nontrypticCount << endl; - idxFout << "### METHIONINE MOD Oxidized: " << m_metOxCount << endl; - idxFout << "### CYSTEINE MOD CAM: " << m_camCount << " ; Cleavable-ICAT: " << m_icatClCount << " ; Uncleavable-ICAT: " << m_icatUcCount << endl; - idxFout << "### PROBABILITY >0.9999: " << m_prob9999 << " ; 0.999-0.9999: " << m_prob999 << " ; 0.99-0.999: " << m_prob99 << " ; 0.9-0.99: " << m_prob9 << " <0.9: " << m_prob0 << endl; - idxFout << "### NREPS 20+: " << m_nreps20 << " ; 10-19: " << m_nreps10 << " ; 4-9: " << m_nreps4 << " ; 2-3: " << m_nreps2 << " ; 1: " << m_nreps1 << endl; + + printStats(idxFout, "### "); + idxFout << "### ===" << endl; for (map<string, map<string, vector<fstream::off_type> > >::iterator i = m_map.begin(); i != m_map.end(); i++) { @@ -301,14 +302,12 @@ while (nextLine(idxFin, line, "### ===")); } - string lastPeptide(""); + string lastKey(""); while (nextLine(idxFin, line, "", "")) { string::size_type pos = 0; - string peptide = nextToken(line, pos, pos); - if (peptide != lastPeptide) { - m_strippedPeptideCount++; - } + + string key = nextToken(line, pos, pos); string subkey = nextToken(line, pos, pos); int charge = 0; string mods(""); @@ -316,46 +315,64 @@ parseSubkey(subkey, charge, mods, frag); Peptide* pep = NULL; - if (peptide[0] == '_') { + if (key[0] == '_') { // not a peptide - m_nonPeptideCount++; + m_nonPeptideIonCount++; + + if (key != lastKey) { + m_nonPeptideCount++; + } + } else { - pep = new Peptide(peptide, charge, mods); + pep = new Peptide(key, charge, mods); m_peptideIonCount++; + + if (key != lastKey) { + m_peptideSequenceCount++; + } } + unsigned int spectrumCount = 0; + while (pos < line.length()) { fstream::off_type offset = strtoull((nextToken(line, pos, pos)).c_str(), NULL, 10); - ((m_map[peptide])[subkey]).push_back(offset); + ((m_map[key])[subkey]).push_back(offset); - m_entryCount++; - m_spectrumCount++; + spectrumCount++; - if (!pep) { - pos++; - continue; + pos++; + } + + m_entryCount += spectrumCount; + + if (charge > 5) { + m_chargeCount[5] += spectrumCount; + } else if (charge >= 1) { + m_chargeCount[charge - 1] += spectrumCount; + } + + if (!pep) { + + m_nonPeptideSpectrumCount += spectrumCount; + + } else { + // is a peptide + + m_peptideSpectrumCount += spectrumCount; + + map<string, unsigned int> presentModTypes; + if (pep->getAllPresentModTypes(presentModTypes)) { + for (map<string, unsigned int>::iterator m = presentModTypes.begin(); m != presentModTypes.end(); m++) { + string tag = m->first; + if (m_modsCount.find(tag) != m_modsCount.end()) { + m_modsCount[tag] += spectrumCount; + } else { + m_modsCount[tag] = spectrumCount; + } + } } - - if (pep->charge == 1) { - m_charge1Count++; - } else if (pep->charge == 2) { - m_charge2Count++; - } - - if (pep->isCleavableICAT()) { - m_icatClCount++; - } - if (pep->isUncleavableICAT()) { - m_icatUcCount++; - } - if (pep->isCAMCysteine()) { - m_camCount++; - } - if (mods.find("M,Oxidation", 0) != string::npos) { - m_metOxCount++; - } - + if (pep->NTT() == 2) { m_trypticCount++; } else if (pep->NTT() == 1) { @@ -364,18 +381,15 @@ m_nontrypticCount++; } - - - pos++; - } - - if (((m_map[peptide])[subkey]).size() > 1) { + } // if (pep) + + if (((m_map[key])[subkey]).size() > 1) { m_isUnique = false; } - lastPeptide = peptide; + lastKey = key; - } + } // next line in .pepidx file } @@ -540,59 +554,55 @@ return (m_isUnique); } -// getStats - returns statistics -unsigned int SpectraSTPeptideLibIndex::getStats(string which) { +// printStats - prints out statistics +void SpectraSTPeptideLibIndex::printStats(ostream& out, string linePrefix) { - if (which == "spectrum") { - return m_spectrumCount; - } else if (which == "peptideIon") { - return m_peptideIonCount; - } else if (which == "strippedPeptide") { - return m_strippedPeptideCount; - } else if (which == "charge1") { - return m_charge1Count; - } else if (which == "charge2") { - return m_charge2Count; - } else if (which == "charge3") { - return m_charge3Count; - } else if (which == "tryptic") { - return m_trypticCount; - } else if (which == "semitryptic") { - return m_semitrypticCount; - } else if (which == "nontryptic") { - return m_nontrypticCount; - } else if (which == "metOx") { - return m_metOxCount; - } else if (which == "cam") { - return m_camCount; - } else if (which == "icatCl") { - return m_icatClCount; - } else if (which == "icatUc") { - return m_icatUcCount; - } else { - // no such stat - return (unsigned int)(-1); // note: cast added for mingw build with Werror; please check if this return val is correct as function is declared to return unsigned int (jmt) + out << linePrefix << "Total number of spectra in library: " << m_entryCount << endl; + + if (m_entryCount != m_peptideSpectrumCount) { + out << linePrefix << "Total number of peptide spectra in library: " << m_peptideSpectrumCount << endl; } - -} -// printStats - prints out statistics -void SpectraSTPeptideLibIndex::printStats() { - - cout << "Total number of spectra in library: " << m_spectrumCount << endl; - cout << "Total number of distinct peptide ions in library: " << m_peptideIonCount << endl; - cout << "Total number of distinct stripped peptides in library: " << m_strippedPeptideCount << endl; - if (m_nonPeptideCount > 0) { - cout << "Total number of non-peptide entries in library: " << m_nonPeptideCount << endl; + out << linePrefix << "Total number of distinct peptide ions in library: " << m_peptideIonCount << endl; + out << linePrefix << "Total number of distinct stripped peptides in library: " << m_peptideSequenceCount << endl; + + if (m_nonPeptideSpectrumCount > 0) { + out << linePrefix << "Total number of non-peptide spectra in library: " << m_nonPeptideSpectrumCount << endl; + out << linePrefix << "Total number of distinct non-peptide ions in library: " << m_nonPeptideIonCount << endl; + out << linePrefix << "Total number of distinct non-peptide entities in library: " << m_nonPeptideCount << endl; } + + out << linePrefix << endl; + + out << linePrefix << "CHARGE "; + for (unsigned int c = 1; c <= 5; c++) { + out << "+" << c << ": " << m_chargeCount[c - 1] << " ; "; + } + out << ">+5: " << m_chargeCount[5] << endl; - cout << "CHARGE +1: " << m_charge1Count << " ; +2: " << m_charge2Count << " ; +3: " << m_charge3Count << endl; - cout << "TERMINI Tryptic: " << m_trypticCount << " ; Semi-tryptic: " << m_semitrypticCount << " ; Non-tryptic: " << m_nontrypticCount << endl; - cout << "METHIONINE MOD Oxidized: " << m_metOxCount << endl; - cout << "CYSTEINE MOD CAM: " << m_camCount << " ; Cleavable-ICAT: " << m_icatClCount << " ; Uncleavable-ICAT: " << m_icatUcCount << endl; - cout << "PROBABILITY >0.9999: " << m_prob9999 << " ; 0.999-0.9999: " << m_prob999 << " ; 0.99-0.999: " << m_prob99 << " ; 0.9-0.99: " << m_prob9 << " <0.9: " << m_prob0 << endl; - cout << "NREPS 20+: " << m_nreps20 << " ; 10-19: " << m_nreps10 << " ; 4-9: " << m_nreps4 << " ; 2-3: " << m_nreps2 << " ; 1: " << m_nreps1 << endl; - + out << linePrefix << "TERMINI Tryptic: " << m_trypticCount << " ; Semi-tryptic: " << m_semitrypticCount << " ; Non-tryptic: " << m_nontrypticCount << endl; + out << linePrefix << "PROBABILITY >0.9999: " << m_prob9999 << " ; 0.999-0.9999: " << m_prob999 << " ; 0.99-0.999: " << m_prob99 << " ; 0.9-0.99: " << m_prob9 << " <0.9: " << m_prob0 << endl; + out << linePrefix << "NREPS 20+: " << m_nreps20 << " ; 10-19: " << m_nreps10 << " ; 4-9: " << m_nreps4 << " ; 2-3: " << m_nreps2 << " ; 1: " << m_nreps1 << endl; + + out << linePrefix << "MODIFICATIONS "; + char lastAA = '0'; + for (map<string, unsigned int>::iterator m = m_modsCount.begin(); m != m_modsCount.end(); m++) { + if (m->first[0] == lastAA) { + out << " ; "; + } else if (lastAA != '0') { + out << endl << linePrefix << " "; + } + out << m->first << ": " << m->second; + lastAA = m->first[0]; + } + + if (lastAA == '0') { + // no modification at all + out << "None"; + } + + out << endl; + } void SpectraSTPeptideLibIndex::getAllSequences(vector<string>& seqs) { Modified: trunk/trans_proteomic_pipeline/src/Search/SpectraST/SpectraSTPeptideLibIndex.hpp =================================================================== --- trunk/trans_proteomic_pipeline/src/Search/SpectraST/SpectraSTPeptideLibIndex.hpp 2010-06-19 16:18:57 UTC (rev 5041) +++ trunk/trans_proteomic_pipeline/src/Search/SpectraST/SpectraSTPeptideLibIndex.hpp 2010-06-20 09:37:35 UTC (rev 5042) @@ -78,9 +78,8 @@ // check if the library is a "unique" library, meaning each peptide ion is represented by only one spectrum bool isUniqueLibrary(); - // method to access the statistics - unsigned int getStats(string which); - void printStats(); + // method to print the statistics + void printStats(ostream& out, string linePrefix = ""); // method to extract all distinct peptide sequences void getAllSequences(vector<string>& seqs); @@ -89,20 +88,21 @@ private: // statistics - unsigned int m_spectrumCount; + + unsigned int m_peptideSequenceCount; unsigned int m_peptideIonCount; - unsigned int m_strippedPeptideCount; - unsigned int m_charge1Count; - unsigned int m_charge2Count; - unsigned int m_charge3Count; + unsigned int m_peptideSpectrumCount; + + vector<unsigned int> m_chargeCount; unsigned int m_trypticCount; unsigned int m_semitrypticCount; unsigned int m_nontrypticCount; - unsigned int m_icatClCount; - unsigned int m_icatUcCount; - unsigned int m_camCount; - unsigned int m_metOxCount; + + map<string, unsigned int> m_modsCount; + unsigned int m_nonPeptideCount; + unsigned int m_nonPeptideIonCount; + unsigned int m_nonPeptideSpectrumCount; unsigned int m_prob9999; unsigned int m_prob999; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |