robertthebob2 - 2015-01-23

Here is my Makefile to run BALSA, but I'm blocked at running the
alignment by a CUDA MALLOC error, so the Makefile is not completely debugged.

# Makefile for Balsa Cuda analysis of mm10 sequences
# FQLINES in each split fastq file, multiple of four, for pair-multi
FQLINES=20000000
BPATH=~/Downloads/balsax/balsa/balsa_cuda_6.5_GTX9x0
REF=reference/mm10.fa
# $REFI is a just prefix and no file of that name should exist. 
REFI=$(REF).index
REFILIST=$(REFI).amb $(REFI).ann $(REFI).bwt $(REFI).fmv \
$(REFI).lkt $(REFI).pac $(REFI).rev.bwt $(REFI).rev.fmv \
$(REFI).rev.lkt $(REFI).rev.pac $(REFI).sa $(REFI).tra

REFIGPULIST=$(REFI).fmv.mmap  $(REFI).rev.fmv.mmap $(REFI).pac.mmap
VCFPATH=indels-snps
IVCF=mgp.v4.indels.dbSNP.vcf
SVCF=mgp.v4.snps.dbSNP.vcf
FASTQD=fastq
# fastq file names without the .fastq extension
# (R1_001,   not R1_001.fastq)
R1=index39_CTATAC_L001-L002_R1_001
R2=index39_CTATAC_L001-L002_R2_001
SOAPPATH=~/Downloads/soap3-dp-5.5

# Obtain the gene region list manually from UCSC browser - whole gene, known genes BED file
GRL=mm10-gene-region-list

# Should not have to change anything below this point.

soap3-dp-builder.ini : $(SOAPPATH)/soap3-dp-builder.ini
    ln -sf  $(SOAPPATH)/soap3-dp-builder.ini .

#Build the 2BWT index.
$(REFILIST) :  soap3-dp-builder.ini $(REF)
    time $(SOAPPATH)/soap3-dp-builder $(REF)

#Convert the 2BWT index to the GPU-2BWT index, just a few seconds
$(REFGPULIST) : $(REFILIST)
    time $(SOAPPATH)/BGS-Build $(REFI)

# Compute SnpDB index,  
#   $(BPATH)/SnpDBBuilder $(REFI) <VCF> <Output Prefix>
mm10.snpDB.1list mm10.snpDB.bv snpdbtest : $(REFILIST)  $(VCFPATH)/$(SVCF)
    time $(BPATH)/SnpDBBuilder $(REFI) $(VCFPATH)/$(SVCF) mm10.snpDB

# Compute IndelDB index, about 20 sec for mm10 
# IndelDBBuilder <2BWT Index> <VCF> <Output file name>
mm10.idelDB : $(REFILIST)  $(VCFPATH)/$(IVCF)
    time $(BPATH)/IndelDBBuilder $(REFI) $(VCFPATH)/$(IVCF) mm10

# Download $SVCF.gz from ftp://ftp-mouse.sanger.ac.uk/current_snps/mgp.v4.snps.dbSNP.vcf.gz
$(VCFPATH)/$(SVCF) svcf :  
    wget ftp://ftp-mouse.sanger.ac.uk/current_snps/$(SVCF).gz
    gunzip $(SVCF).gz
    mv $(SVCF) $(VCFPATH)

# Download $IVCF.gz from ftp://ftp-mouse.sanger.ac.uk/current_snps/mgp.v4.indels.dbSNP.vcf.gz
$(VCFPATH)/$(IVCF) ivcf :  
    wget ftp://ftp-mouse.sanger.ac.uk/current_snps/$(IVCF).gz
    gunzip $(IVCF).gz
    mv $(IVCF) $(VCFPATH)

# Build region index from the list file obtained with the UCSC genome browser
# with "known genes" BED file.   Takes just a few seconds
# RegionIndexBuilder <2BWT Index> <Input region list file> <Output List File> <Input File Format>
$(GRL).index : $(GRL).bed
    time $(BPATH)/RegionIndexBuilder $(REFI) $(GRL).bed $(GRL).index -bed

#Align the unpaired reads to the genome
# make sam output (-b 2)   bam (-b 3)
# balsa pair       <2BWT Index> <dbSnp Index> <Indel DB> <General Region List> <First FASTQ File> <Second FASTQ FILE> <result Prefix> [options]
# balsa pair-multi <2BWT Index> <dbSnp Index> <Indel DB> <General Region List> <Read File Pairs List> <result Prefix> [options]
reads_1.fq.out: soap3-dp.ini pairs.list
    time $(BPATH)/balsa pair-multi $(REFI) mm10.snpDB mm10.indelDB mm10-gene-region-list.index pairs.list result -b 3  

pairs.list : $(FASTQD)/$(R1).fastq $(FASTQD)/$(R2).fastq
    cd $(FASTQD); split -l $(FQLINES) $(R1).fastq $(R1).split
    cd $(FASTQD); split -l $(FQLINES) $(R2).fastq $(R2).split
    ls $(FASTQD)/$(R1).split* > $(FASTQD)/r1.list
    ls $(FASTQD)/$(R2).split* > $(FASTQD)/r2.list
    paste $(FASTQD)/r1.list $(FASTQD)/r2.list > $(FASTQD)/files.list
    perl -pe 's@\n@\t400\t600\tresult\n@' $(FASTQD)/files.list > pairs.list

clean.pairs.list : 
    rm -f  $(FASTQD)/*split* $(FASTQD)/files.list  $(FASTQD)/r?.list
    rm -f pairs.list
 

Last edit: robertthebob2 2015-01-26