Menu

ctssTable

Timo Lassmann

% ctssTable
% Timo Lassmann
% March 5th 2014

Introduction

ctssTable extracts the raw ctss counts from a tome database and prints them out in a table.

Usage

$ ctssTable  <database name>  -sample <sample1>,<sample2>,<sample...> -out <output.txt>

Output

The output is a tab separated file listing:

  1. Chromosome
  2. Position
  3. Strand
  4. CTSS counts in sample1
  5. CTSS counts in sample2
  6. CTSS counts in sample...

File: ctsstable.c

/*
 tomeTools - a collection of programs to work with CAGE data.
 Author: Timo Lassmann <timolassmann@gmail.com>
 Copyright (C) 2014 RIKEN

 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <stdlib.h>
#include <stdio.h>
#include <getopt.h>
#include <string.h>
#include<sys/stat.h>

#if HAVE_CONFIG_H
#include <config.h>
#endif

#include "tomedb.h"

#define OPT_SAMPLE 1

double* get_sample_data_from_matrix(struct  crf* crf,double* column,int* use_sample , int tracks,int strand,int x);

void usage();

int main (int argc, char * argv[])
{
    int i,c,f,g;
    char* tmp = 0;
    int count;

    int num_samples = 0;
    char** sample_names = 0;
    int help = 0;
    int quiet = 0;
    char* outfile = 0;

    while (1){
        static struct option long_options[] ={
            {"sample",required_argument,0,OPT_SAMPLE},
            {"out",required_argument,0, 'o'},
            {"quiet",0,0,'q'},
            {"help",0,0,'h'},
            {0, 0, 0, 0}
        };

        int option_index = 0;
        c = getopt_long_only (argc, argv,"o:qh:",long_options, &option_index);

        if (c == -1){
            break;
        }

        switch(c) {
            case 0:
                break;
            case OPT_SAMPLE:
                tmp = optarg;
                count = byg_count(",", tmp);
                //fprintf(stderr,"%d\n",count);
                num_samples = count+1;
                sample_names = malloc(sizeof(char*) * (count+1));
                for(i = 0; i < count+1;i++){
                    sample_names[i] = malloc(sizeof(char)* 1000);
                }
                f = 0;
                g = 0;
                for(i = 0;i < strlen(tmp);i++){
                    if(tmp[i] != ','){
                        if(tmp[i] != ' '){
                            sample_names[f][g] = tmp[i];
                        }
                        g++;
                    }else{
                        sample_names[f][g] = 0;
                        f++;
                        g = 0;
                    }
                }
                sample_names[f][g] = 0;

                break;
            case 'q':
                quiet = 1;
                break;
            case 'h':
                help = 1;
                break;

            case 'o':
                outfile = optarg;
                break;
            case '?':
                exit(1);
                break;
            default:
                fprintf(stderr,"default\n\n\n\n");
                abort ();
        }
    }
    char* db_file = 0;

    if(help){
        usage();
        exit(EXIT_SUCCESS);
    }

    db_file = argv[optind++];
    if(!db_file){
        fprintf(stderr,"Error: no database file...\n");
        exit(EXIT_FAILURE);
    }

    if(num_samples == 0){
        fprintf(stderr,"Error: no samples specified..\n");
        exit(EXIT_FAILURE);
    }

    TOMEDB* db = 0;

    db = main_read_db(db,db_file);

    struct tome_data* td = init_tome_data(100,db->num_tracks );

    td->us =  init_use_samples(db,sample_names,0, num_samples ,0);

    FILE* out = 0;
    int blob_length = 0;
    unsigned char *blob_data = 0;
    struct crf * crf = 0;
    const char *zSql = "SELECT * FROM tome_data;";
    sqlite3_stmt *pStmt;
    sqlite3 *sqlite_db;
    int rc;
    blob_length = 0;
    blob_data = 0;

    rc = sqlite3_open(db_file, &sqlite_db);

    char* chr = 0;
    unsigned int pos = 0;
    unsigned int offset = 0;

    double* column = malloc(sizeof(double)* (db->num_tracks+1));

    if(outfile){
        if ((out = fopen(outfile, "w")) == NULL){
            fprintf(stderr,"can't open output\n");
            exit(EXIT_FAILURE);
        }
    }else{
        out = stdout;
    }

    fprintf(out,"Chromosome\tPosition\tStrand");
    for(i = 0; i < db->num_tracks;i++){
        if(td->us->use_sample[i]){
            fprintf(out,"\t%s",td->us->sample_names[i]);
        }
    }
    fprintf(out,"\n");

    rc = sqlite3_prepare(sqlite_db, zSql, -1, &pStmt, 0);
    if( rc!=SQLITE_OK ){
        fprintf(stderr,"could not establish database connection.\n");
        exit(EXIT_FAILURE);
    }
    while ( sqlite3_step(pStmt)  == SQLITE_ROW){
        rc = sqlite3_column_int(pStmt,0);
        blob_length = sqlite3_column_bytes(pStmt, 1);
        blob_data = (unsigned char *)malloc(blob_length);
        memcpy(blob_data, sqlite3_column_blob(pStmt, 1), blob_length);

        blob_data  = unrle(blob_data,&blob_length);

        crf = make_crf_from_blob(crf, blob_data,blob_length,db->num_tracks);
        free(blob_data);
        crf->genome_pos = rc;

        for(i= 0; i < TOMEDB_DB_BLOCK_SIZE;i++){
            column = get_sample_data_from_matrix(crf,column, td->us->use_sample , db->num_tracks,0,i);
            if(column[num_samples] != 0.0){
                pos = (crf->genome_pos << 15u) +  i;
                chr = get_chr_name(db, pos );
                offset = get_chr_offset (db->seq_info,chr);
                fprintf(out, "%s\t%d\t+\t",chr, pos - offset);

                for(c = 0;c < num_samples-1;c++){
                    fprintf(out,"%d\t",(int)column[c]);
                    column[c] = 0.0;
                }
                fprintf(out,"%d\n",(int)column[num_samples-1]);
                column[num_samples-1] = 0.0;
            }

        }

        free_crf(crf);
        crf = 0;
    }
    sqlite3_finalize(pStmt);

    rc = sqlite3_prepare(sqlite_db, zSql, -1, &pStmt, 0);
    if( rc!=SQLITE_OK ){
        fprintf(stderr,"could not establish database connection.\n");
        exit(EXIT_FAILURE);
    }
    while ( sqlite3_step(pStmt)  == SQLITE_ROW){
        rc = sqlite3_column_int(pStmt,0);
        blob_length = sqlite3_column_bytes(pStmt, 1);
        blob_data = (unsigned char *)malloc(blob_length);
        memcpy(blob_data, sqlite3_column_blob(pStmt, 1), blob_length);

        blob_data  = unrle(blob_data,&blob_length);

        crf = make_crf_from_blob(crf, blob_data,blob_length,db->num_tracks);
        free(blob_data);
        crf->genome_pos = rc;

        for(i= 0; i < TOMEDB_DB_BLOCK_SIZE;i++){

            column = get_sample_data_from_matrix(crf,column, td->us->use_sample , db->num_tracks,1,i);
            if(column[num_samples] != 0.0){
                pos = (crf->genome_pos << 15u) +  i;
                chr = get_chr_name(db, pos );
                offset = get_chr_offset (db->seq_info,chr);
                fprintf(out, "%s\t%d\t-\t",chr, pos - offset);
                for(c = 0;c < num_samples-1;c++){
                    fprintf(out,"%d\t",(int)column[c]);
                    column[c] = 0.0;
                }
                fprintf(out,"%d\n",(int)column[num_samples-1]);
                column[num_samples-1] = 0.0;
            }

        }

        free_crf(crf);
        crf = 0;
    }
    sqlite3_finalize(pStmt);

    //fclose(file);

    sqlite3_close(sqlite_db);

    free_tome_data(td);

    free_db(db);

    if(outfile){
        fclose(out);
    }

    for(i = 0; i < num_samples;i++){
        free(sample_names[i]);
    }
    free(sample_names);

    return EXIT_SUCCESS;
}

double* get_sample_data_from_matrix(struct  crf* crf,double* column,int* use_sample , int tracks,int strand,int x)
{
    int i,j;
    int f = 0;
    double sum = 0;
    if(!strand){
        f = 0;
        for(i = 0; i < tracks;i++){
            if(use_sample[i]){
                column[f] = 0;
                if(i < crf->num_plus_tracks){
                    for(j = crf->plus_row_ptr[i]; j < crf->plus_row_ptr[i+1]; j++){
                        if(crf->plus_col_index[j] == x ){
                            column[f] = crf->plus_values[j];
                            sum +=crf->plus_values[j];
                        }

                    }
                }
                f++;

            }
        }
    }else{
        f = 0;
        for(i = 0; i < tracks;i++){
            if(use_sample[i]){
                column[f] = 0;
                if(i < crf->num_minus_tracks){
                    for(j = crf->minus_row_ptr[i]; j < crf->minus_row_ptr[i+1]; j++){

                        if(crf->minus_col_index[j] == x){
                            column[f] = crf->minus_values[j];
                            sum +=crf->minus_values[j];
                        }

                    }
                }

                f++;
            }
        }
    }
    column[f] = sum;
    return column;
}

void usage()
{
    fprintf(stdout, "\n%s %s, Copyright (C) 2014 Riken <%s>\n",PACKAGE_NAME, PACKAGE_VERSION,PACKAGE_BUGREPORT);
    fprintf(stdout, "\n");
    fprintf(stdout, "Usage:   ctssTable [options] <db> -sample <sample1>,<sample2>,<sample...> \n\n");

    fprintf(stdout, "Options:\n");

    fprintf(stdout, "         -o         STR     output file prefix.\n");
    fprintf(stdout, "\n");

}

Related

Wiki: Home