Learn how easy it is to sync an existing GitHub or Google Code repo to a SourceForge project! See Demo


Commit [r24] Maximize Restore History


acenglishbcm 2014-03-13

added /LabNotebook
added /LabNotebook/page2.txt
added /bin/massivePhrap.py
added /bin/mergingCalls.py
added /packages/PBSuite_14.3.7.tgz
added /src/dev
added /src/dev/mergingCalls.py
changed /setup.sh
changed /src/database/indices.sql
changed /src/database/schema.sql
changed /src/parliament/analysis/addAnnot.py
changed /src/parliament/analysis/ovlCalls.py
changed /src/parliament/analysis/utils.py
changed /src/parliament/io/BedIO.py
changed /src/parliament/io/DBIO.py
changed /src/parliament/io/parsers/BNGLoader.py
changed /src/parliament/io/parsers/SVachraLoader.py
copied /LabNotebook.txt -> /LabNotebook/page1.txt
copied /LabNotebook.txt -> /src/dev/massivePhrap.py
/LabNotebook/page2.txt Diff Switch to side-by-side view
/bin/massivePhrap.py Diff Switch to side-by-side view
/bin/mergingCalls.py Diff Switch to side-by-side view
/packages/PBSuite_14.3.7.tgz Diff Switch to side-by-side view
/src/dev/mergingCalls.py Diff Switch to side-by-side view
/setup.sh Diff Switch to side-by-side view
/src/database/indices.sql Diff Switch to side-by-side view
/src/database/schema.sql Diff Switch to side-by-side view
/src/parliament/analysis/addAnnot.py Diff Switch to side-by-side view
/src/parliament/analysis/ovlCalls.py Diff Switch to side-by-side view
/src/parliament/analysis/utils.py Diff Switch to side-by-side view
/src/parliament/io/BedIO.py Diff Switch to side-by-side view
/src/parliament/io/DBIO.py Diff Switch to side-by-side view
/src/parliament/io/parsers/BNGLoader.py Diff Switch to side-by-side view
/src/parliament/io/parsers/SVachraLoader.py Diff Switch to side-by-side view
/LabNotebook.txt to /LabNotebook/page1.txt
File was copied or renamed.
/LabNotebook.txt to /src/dev/massivePhrap.py
--- a/LabNotebook.txt
+++ b/src/dev/massivePhrap.py
@@ -1,164 +1,155 @@
+import sys, os
+import pysam
+from pbsuite.utils.CommandRunner import *
+from tempfile import NamedTemporaryFile
-#Need to describe filtering
-0 | Illumina
-1 | PacBio
-2 | Nextera
-3 | BioNano
-4 | Array
-5 | SOLiD
-6 | Claudia (Confirmed Sanger Sequences)
-NAME		  | Abbrev
-breakdancer_1.4.3 | BD
-Crest_1.0         | C
-cnvnator_v.0.2.7  | CNV
-delly_v0.0.9      | D
-Pindel_v0.2.4t    | P
-svstat-0.0.5-vb10 | S
-Tiresias          | TI
-Honey             | HON
-SVachra-v1.5      | NX
-BioNanoGenomics   | BNG
-JRLArray          | JAR
-JRLSequencing     | JSO
+region = sys.argv[1]
+bam1 = sys.argv[2]
+bam2 = sys.argv[3]
+#I'm going to trim the edges, too
+def getTag(read, tagId):
+    """
+    Returns the specified tag or None from an AlignmentRecord
+    """
+    for i in read.tags:
+        if i[0] == tagId:
+            return i[1]
+    return None
-After running our 13 Programs on 7 Sources, we get 54424 calls. (13 Results)
+def fetchReads(region, bamName, trim=False):
+    """
+    Trim is for pacbio reads that have tails that were attempted to 
+    be remapped -- helps reduce redundancy
+    Also, I'm going to read tails that extend beyond my boundaries
+    """
+    print "fetching %s from %s" % (bamName, region)
+    chr, pos = region.split(':')
+    start, end = [int(x) for x in pos.split('-')]
+    bam = pysam.Samfile(bamName)
+    ret = []
+    toQual = lambda y: " ".join([str(ord(x)-33) for x in y])
+    for id, read in enumerate(bam.fetch(region=region)):
+        name = read.qname 
+        name += " DIRECTION: rev" if read.is_reverse else " DIRECTION: fwd"
+        seq, qual = read.seq, read.qual 
+        if trim and not read.is_unmapped:
+            regTrim = 0
+            upS = read.cigar[0][1] if read.cigar[0][0] == 4 else 0
+            dnS = read.cigar[-1][1] if read.cigar[-1][0] == 4 else 0
+            trimS = -1
+            trimE = -1
+            if start > read.pos:
+                for queryPos, targetPos in read.aligned_pairs:
+                    if trimS == -1 and targetPos >= start:
+                        trimS = queryPos
+            if end < read.aend:
+                for queryPos, targetPos in read.aligned_pairs[::-1]:
+                    if trimE == -1 and targetPos <= end:
+                        trimE = queryPos
+            trimS = max(0, trimS)
+            trimE = min(len(read.seq), trimE)
+            seq = seq[trimS:trimE]
+            qual = qual[trimS:trimE]
+            print "regionTrim", trimS + (len(read.seq) - trimE)
+            if read.flag & 0x01 and False:
+                #if it has a mapped tail, trim it
+                if read.cigar[0][0] == 4: 
+                    t = False
+                    if read.is_reverse and getTag(read, "ER") is not None and upS == 0:
+                        t = True
+                    elif not read.is_reverse and getTag(read, "PR") is not None and upS == 0:
+                        t = True
+                    if t:
+                        seq = seq[read.cigar[0][1]:]
+                        qual = qual[read.cigar[0][1]:]
+                        tailTrim += read.cigar[0][1]
+                if read.cigar[-1][0] == 4:
+                    t = False
+                    if read.is_reverse and getTag(read, "PR") is not None and dnS == 0:
+                        t = True
+                    elif not read.is_reverse and getTag(read, "ER") is not None and upS == 0:
+                        t = True
+                    if t:
+                        seq = seq[:len(seq)-read.cigar[-1][1]]
+                        qual = qual[:len(qual)-read.cigar[-1][1]]
+                        tailTrim += read.cigar[-1][1]
+        ret.append([name, seq, toQual(qual)])
+    print "%d reads retreived" % len(ret)
+    return ret
+#Grabbing my reads
+print "grab"
+reads = fetchReads(region, bam1)
-	We're starting here because this is the easiest place.
-	Here we can see how many events each program is calling and how frequently they're calling
-	each reduced type.
-	Honey is making the most calls and JRL(claudia's confirmed 42) is obviously making the least (they're all DEL)
-	Looking into the size of the calls on a per-program basis, we have this way too busy but kinda informative
-	histogram. You can see that most of our calls are 1kb or less, there are a good number of calls over events that
-	are between 1kb and 10kb. And then a relatively small set of calls that are >10kb.
-If we look at how many of the calls from each of the results intersect with known events, we see
-	Prog	NumAnnot  NumCal  Pct
-	BD.0    1138      6256    0.181905370844
-	BNG.3   7         422     0.0165876777251
-	C.0     1815      3049    0.595277140046
-	CNV.0   410       3075    0.133333333333
-	D.0     1053      6946    0.151598042039
-	HON.1   898       12807   0.0701179042711
-	JAR.4   93        993     0.0936555891239
-	JRL.6   39        42      0.928571428571
-	JSO.5   151       926     0.163066954644
-	NX.2    60        6330    0.00947867298578
-	P.0     1900      7424    0.255926724138
-	S.0     1315      4700    0.279787234043
-	TI.0    24        1454    0.0165061898212
-An interesting result here is that there are 3 JRLs without Knowns. I looked at them manually and saw that this isn't
-really the fault of my intersection technique, but because the data for these isn't in DGV.A
-Talking to Christine Beck about this, she says,
-	"...having ~1/10-1/20 calls not in DGV isn��t too astounding to me...
-	If there��s nothing going on in the DGV track, can you check the same 
-	loci in HG18 and the HGSV track? Or the Conrad Nature paper on Arrays 
-	and Variants? They may have been seen using other technologies"
-A.k.a, our Known AnnotEntries needs to be extended, but for now, this gives us an idea of how well the annotation
-pipeline works. It gets most stuff, but just becasue something isn't gotten doesn't mean we don't have good information
+reads.extend(fetchReads(region, bam2, trim=True))
+print "write"
+fout = open("reads.fasta",'w')
+qout = open("reads.fasta.qual", 'w')
+for name, seq, qual in reads:
+    fout.write(">%s\n%s\n" % (name, seq))
+    qout.write(">%s\n%s\n" % (name, qual))
+print "phrap"
+r, o, e = exe("/hgsc_software/gen-asm/phrap/phrap reads.fasta -minmatch 6")
+r, o, e = exe("/hgsc_software/gen-asm/phrap/phrap reads.fasta.problems -minmatch 6")
+r, o, e = exe("cat reads.fasta.contigs reads.fasta.problems.contigs > asm.fasta")
+print "mapping"
+r, o, e = exe(("blasr asm.fasta /hgsc_software/pacbioSMRTAnalysis/smrtanalysis/current/common/references/human_g1k_v37/sequence/human_g1k_v37.fasta "
+               "-sa /hgsc_software/pacbioSMRTAnalysis/smrtanalysis/current/common/references/human_g1k_v37/sequence/human_g1k_v37.fasta.sa "
+               "-clipping soft -sam -bestn 1 -out remap.sam"))
+#Honey pie
-First, let's look at what happens if we try to reduce the number of thins we're looking at by grouping
-these calls via overlaps and creating chains
-In total, there are 4,109 Chains that are composed of 14,775 calls (27.1%)
-Of these 4k chains,
-	How many calls are in each Chain?
-	As you can see, most overlaps are just between a few of the programs. Not every method is capable of picking up
-	the every event in the same places. 
-	Trying to combine the types inside a chain,
-	Sometimes they conflict (even with our reduced type) which is why we have the mix
-	Mix !Del is some kind of insertion/translocation where we just have poor annotation on it
-	Mix DEL is likely one of the programs being dumb in it's annotation
-	Blocks are sorted from most to least number of results in the chain (bottom to top)
-	You can see that the larger blocks are in the middle or bottom, which is what we'd expect, it's hard to have
-	a lot of programs calling the same thing frequently, instead, it's just going to be a few agreeing
-	What's the average size of the calls within a chain?
-	To look at exactly how frequently each program is overlapping with any other program, we have the following 
-	heat map
-How many chains are annotated?
-	Cat |#Chain | pct of chains | #Calls | pct of chain'd calls
-	----+-------+---------------+--------+---------------
-	yes | 2273  | 0.55317595522 |  8124  | 0.549847715736
-        no  | 1836  | 0.44682404478 |  6651  | 0.450152284264
-Now, Overlap isn't really the best way to merge the calls because not every program/source is created equal. BNG can
-only get resolution at a very large scale, so it's breakpoints aren't going to line up very well with, say, a Pindel
-call. Therefore, we will cluster the calls. Clustering is a greedy grouping approach where if even a
-single base between calls overlaps in the reference space, we combine them. Because of how greedy this is, it's possible
-that we have two calls in a cluster together when they don't actually touch because they're both touching a third call.
-##working HERE##
-With this breakdown, we create 6,423 clusters with 31,345 calls (this is going to be more, but that's okay because we
-are loosening our requirements)
-	First of all, a cluster can contain any number of calls because of how we definine it as a call's span. meaning
-	a call by it's self is technically in a cluster.
-	So, let's look first at all of the clusters that are by themselves
-	In total there are 23,079 totally isolated calls
-	Now, we are just going to leave these solo calls where they are for now. Let's look at things that we can
-	consider because there's a little bit of redundancy in the data sets
-	Number of calls in a cluster
-##working HERE##
-	How many overlaps do we have per cluster? (better be zeros and ones (twos means we have something big joining
-	two distinct clusters)
-	with this being so overwhelmingly 2s, you might want to say that clusters are strongly correlated with overlaps,
-	and so each grouping method accurately captures a single event in an locus. 
-	however, if we also need to look at how many calls there are which aren't in chains on a per cluster basis
-	Heat map showing how many of the clusters have x Calls in overlaps and y Calls that are solo.
-	The idea is that we should be able to show that if you are a cluster, everything is overlapping.
-	However, This doesn't seem to be true. the largest percentage (~32%) of our clusters are 2 solo calls that don't
-	have overlap. There's also a good percent with 3 calls that don't overlap and a few with 4.
-	We do have a good number (maybe 1k) that have 0 or 1 'loose calls' and 2or3 calls that chain. These are likely
-	informative
-	Same type reduction as Overlaps on Cluster
-	Average Call Size
-	We could have a mix of sizes, so let's see if there is a difference
-	Already made in dbGenViz
-Consider a locus all of the information we just presented - a cluster of calls, that will contain overlap chains, and
-properties such as GCpct and Mappability, and known annot
-up to this point, we've used isAnnot as our a type of truth metric, and we've explored why it's incomplete (the known is
-incomplete, the method isn't perfect) and we can't very well say that half of our overlaps are bad
+for pos,i in enumerate(fh.readline().strip().split('\t')):
+    exec("%s=%d" % (i, pos))
+for line in fh.readlines():
+    data = line.strip().split('\t')
+    I say we make all of this into a valence protocol -- just copy what we did earlier for the charge pool
+    I also want several pileups of the region.
+    Once I make all of this auto-magically, I will work on how to figure out if they are real at all
+    I will not worry about splitting it out for now
+    tempName = "groupId%s" % data[groupId]
+    samtools view %s:%d-%d % (data[chrom]:data[start]-data[end]) | I need to get this and split...
+    Install Honey in the virtual env...
+    pull just fastq information  and create
+        groupid.fasta - groupid.fasta.qual
+        /hgsc_software/gen-asm/phrap/phrap groupid.fasta -minmatch 6 -forcelevel 10 > out 2>&1 &
+        /hgsc_software/pacbioSMRTAnalysis/smrtanalysis/current/analysis/bin/blasr asm.fasta
+            /stornext/snfs0/next-gen/pacbio/data/references/human_g1k_v37/sequence/human_g1k_v37.fasta -sa
+            /stornext/snfs0/next-gen/pacbio/data/references/human_g1k_v37/sequence/human_g1k_v37.fasta.sa -bestn 1 -nproc 4
+        -out blasrAlign.sam -sam
+        Honey.py pie blasrAlign.sam
+        ~/english/Jelly/DevJelly/trunk/bin/sam2bam
+        /stornext/snfs5/next-gen/Illumina/bwa_references/h/hg19/original/hg19.fa phrapPB.sam
+        samtools mpileup 
+            within region is there evidence, is it het