Menu

tomeDB

Timo Lassmann

% tomeDB
% Timo Lassmann
% December 24th 2013

Introduction

tomeDB is a database engine designed to efficiently store and retrive Capped analysis of gene expression (CAGE) data. Under the hood it implements a compressed sparse matrix1 over the entire genome. To allow for fast random access the matrix is split up into blocks, each saved as an entry in a sqlite database.

Usage

Creating a new database

TomeDB stores the genome sequence in addition to genomic tracks. To create a new database simply:

$ tomedb create <database name> <genome.fa>

Where database name is the prefix used for various database files. The genome.fa is a multi-fasta file containing the reference genome.

Importing CAGE data

TomeDB can import CAGE ctss data in bed format like this:

chr1    567561  567562  wgEncodeRikenCageHpiepcCellPapAlnRep1   8       +
chr1    567572  567573  wgEncodeRikenCageHpiepcCellPapAlnRep1   1       +
chr1    567573  567574  wgEncodeRikenCageHpiepcCellPapAlnRep1   1       +
chr1    567575  567576  wgEncodeRikenCageHpiepcCellPapAlnRep1   10      +
chr1    567576  567577  wgEncodeRikenCageHpiepcCellPapAlnRep1   23      +

The fifth score column counts the number of CAGE reads stating at the given nucleotide. The script makectss.sh can be used to generate the above file from aligned CAGE reads in bam format.

To import ctss files simply:

$ tomedb import <database name> <lib1.ctss.bed> <lib2.ctss.bed>

WARNING: the import is not optimized yet. Loading thousands of tracks may take hours.

Remove tracks from a database.

It is possible to remove all tracks stored in a database like this:

$ tomedb clean <database name>

However, the index to the reference genome sequence will remain.

Reporting on the content of the database.

$ tomedb stats <database name>

Will list the names of all stored tracks and some basic statistics on them.

File: tomedatabase.c

Below is the wrapper code to call the various database functions. Not very interesting.

/*
 tomeTools - a collection of programs to work with CAGE data.
 Author: Timo Lassmann <timolassmann@gmail.com>
 Copyright (C) 2013 RIKEN

 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */


#include <stdlib.h>
#include <stdio.h>
#include <getopt.h>
#include <string.h>
#include "tomedb.h"
#include "tomedb_nuc.h"

#if HAVE_CONFIG_H
#include <config.h>
#endif

void usage();

int main (int argc, char * argv[])
{
    int c;
    int help = 0;
    int quiet = 0;



    while (1){
        static struct option long_options[] ={
            {"quiet",0,0,'q'},
            {"help",0,0,'h'},
            {0, 0, 0, 0}
        };

        int option_index = 0;
        c = getopt_long_only (argc, argv,"o:qh:",long_options, &option_index);

        if (c == -1){
            break;
        }

        switch(c) {
            case 0:
                break;
            case 'q':
                quiet = 1;
                break;
            case 'h':
                help = 1;
                break;
            case '?':
                exit(1);
                break;
            default:
                fprintf(stderr,"default\n\n\n\n");
                abort ();
        }
    }
    char* command = 0;
    char** infile = 0;
    int num_infiles = 0;

    char* db_file = 0;

    if(help){
        usage();
        exit(EXIT_SUCCESS);
    }

    command = argv[optind++];
    if(!command){
        usage();
        exit(EXIT_SUCCESS);
    }


    db_file = argv[optind++];
    if(!db_file){
        fprintf(stderr,"No database name specified...\n");
        usage();
        exit(EXIT_FAILURE);
    }

    infile = malloc(sizeof(char*)* (argc-optind ));

    c = 0;
    while (optind < argc){
        infile[c] =  argv[optind++];
        c++;
    }
    num_infiles = c;

    init_nuc_code();

    TOMEDB* db = 0;

    if(strcmp(command, "create") == 0){
        if(db_exists (db_file)){
            fprintf(stderr,"A database with the same name already exists.\n");
            return EXIT_FAILURE;
        }
        db = init_db(db,db_file);
        db = add_seq_info(db,infile[0] ,db_file);
        create_sqlite_table(db);
        main_write_db(db);
    }

    if(strcmp(command, "import") == 0){
        if(!num_infiles){
            fprintf(stderr,"No files for import found\n");
            return EXIT_FAILURE;
        }
        db = main_read_db(db,db_file);
        db = import_tome_data(db,infile, num_infiles);
        main_write_db(db);
    }

    if(strcmp(command, "clean") == 0){
        db = main_read_db(db,db_file);
        nuke_sqlite_table(db);
        main_write_db(db);
    }

    if(strcmp(command,"stats")==0){
        db = main_read_db(db,db_file);
        fprintf(stderr,"%d\n",db->num_tracks);
        for(c = 0; c < db->num_tracks;c++){
            fprintf(stdout,"%d\t%s\t%d\n",c,db->track_names[c],db->track_totals[c]);
        }
    }
    if(db){
        free_db(db);
    }
    return EXIT_SUCCESS;
}


void usage()
{
    fprintf(stdout, "\n%s %s, Copyright (C) 2013 Riken <%s>\n",PACKAGE_NAME, PACKAGE_VERSION,PACKAGE_BUGREPORT);
    fprintf(stdout, "\n");
    fprintf(stdout, "Usage:   tomedb [command] [options]\n\n");

    fprintf(stdout, "Commands:\n");

    fprintf(stdout, "         tomedb create <dbname> <genome.fa>       - creates a database including the an index of the genome.\n");
    fprintf(stdout, "         tomedb import <dbname> <trackname.bed>   - imports data. In case of a bed file tome assumes the counts are in column 5\n");
    fprintf(stdout, "         tomedb clean <dbname>                    - removes all imported track data from the database. \n");
    fprintf(stdout, "         tomedb stats <dbname>                    - prints stats on imported tracks.\n");

}

File: makectss.sh

This scripts extracted the most five prime positions from mapped CAGE reads, counts them and prints out a bed file. Each entry (row) should correspond to a single nucleotide position. The fifth column is used / abused to store the number of reads.

#!/bin/bash

if [ $# -eq 0 ]
then
cat <<EOF
Usage is : $0 -q <mapping quality cutoff> <map1.bam> <map2.bam> ...
EOF
exit 1;
fi

QCUT=

while getopts q: opt
do
case ${opt} in
q) QCUT=${OPTARG};;
*) usage;;
esac
done

if [ "${QCUT}" = "" ]; then QCUT=10; fi

for var in "$@"
do
if [[ $var =~ bam$ ]]; then
base=${var%%.*}
echo working on: $base

TMPFILE="/tmp/$(basename $0).$RANDOM.txt"
samtools view  -F 4 -u -q $QCUT -b $var |  bamToBed -i stdin > $TMPFILE
cat  ${TMPFILE} \
| awk 'BEGIN{OFS="\t"}{if($6=="+"){print $1,$2,$5}}' \
| sort -k1,1 -k2,2n \
| groupBy -i stdin -g 1,2 -c 3 -o count \
| awk -v x="$base" 'BEGIN{OFS="\t"}{print $1,$2,$2+1,  x  ,$3,"+"}' >> $var.ctss.bed

cat  ${TMPFILE} \
| awk 'BEGIN{OFS="\t"}{if($6=="-"){print $1,$3,$5}}' \
| sort -k1,1 -k2,2n \
| groupBy -i stdin -g 1,2 -c 3 -o count \
| awk -v x="$base" 'BEGIN{OFS="\t"}{print $1,$2-1,$2, x  ,$3,"-"}' >> $var.ctss.bed

rm $TMPFILE
fi
done

  1. Jon Bentley. Programming Pearls, Second Edition, Column 10: Squeezing space. 


Related

Wiki: Home