Menu

pairInteract

Timo Lassmann

% pairInteract
% Timo Lassmann
% December 24th 2013

Introduction

pairInteract is a program to find putative interactions between a list of regulators and a list of targets. It runs random forest based regression to explain the expression profile of a gene by the profiles of all regulators. Important variables are reported as putative interactors. In a second round of computation, pairInteract estimates the false discovery rate of all edges using random decision trees (RDT). In brief RDT is run on the top regulator and a randomized expression profile a thousand times each. For each iteration we assess whether there is is a statistically significant difference (Mann–Whitney U test p <= 0.05 ) in the distributions of tree based mean squared errors between real and randomized data. While the number of successful trial is below a given FDR threshold we continue adding the second best, third best etc.. regulator until the improvement by adding a regulator is above a give FDR threshold.

Usage

$ pairInteract  <database name>  <regulators.bed> <targets.bed> -out <output.txt> -norm unit -ntree 2000 -t 80

Output

The output is a tab separated file listing:

  1. regulator name
  2. target name
  3. gini based variance reduction
  4. oob estimate
  5. zscore
  6. estmated FDR

File: pair.c

/*
 pairInteract - a collection of programs to work with CAGE data.
 Author: Timo Lassmann <timolassmann@gmail.com>
 Copyright (C) 2013 RIKEN

 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <stdlib.h>
#include <stdio.h>
#include <getopt.h>
#include <string.h>
#include "tomedb.h"
#include "cauldron.h"

#define OPT_SAMPLE 1
#define OPT_NORM 2

#define OPT_NUM_TREE 3

#define OPT_NUM_THREAD 4

#define OPT_MTRY 5

int main (int argc, char * argv[])
{
    struct random_forrest_data* rfd = 0;
    FILE* file = 0;
    int i,j,c,f,g;
    char* tmp = 0;
    int count;

    int input_num_samples = 0;
    char** sample_names = 0;
    char* norm = 0;
    int help = 0;
    int quiet = 0;
    int gzipped = 0;
    char* column_spec = 0;
    char* outfile = 0;

    int num_tree = 1000;
    int num_thread = 8;
    int mtry = 0;

    while (1){
        static struct option long_options[] ={
            {"sample",required_argument,0,OPT_SAMPLE},
            {"norm",required_argument,0,OPT_NORM},
            {"ntree",required_argument,0,OPT_NUM_TREE},
            {"nthread",required_argument,0,OPT_NUM_THREAD},
            {"mtry",required_argument,0,OPT_MTRY},
            {"out",required_argument,0, 'o'},
            {"quiet",0,0,'q'},
            {"help",0,0,'h'},
            {0, 0, 0, 0}
        };

        int option_index = 0;
        c = getopt_long_only (argc, argv,"o:qh:",long_options, &option_index);

        if (c == -1){
            break;
        }

        switch(c) {
            case 0:
                break;
            case OPT_MTRY:
                mtry = atoi(optarg);
                break;
            case OPT_NUM_TREE:
                num_tree = atoi(optarg);
                break;
            case OPT_NUM_THREAD:
                num_thread = atoi(optarg);
                break;

            case OPT_NORM:
                norm = optarg;
                break;
            case OPT_SAMPLE:
                tmp = optarg;
                count = byg_count(",", tmp);
                //fprintf(stderr,"%d\n",count);
                input_num_samples = count+1;
                sample_names = malloc(sizeof(char*) * (count+1));
                for(i = 0; i < count+1;i++){
                    sample_names[i] = malloc(sizeof(char)* 1000);
                }
                f = 0;
                g = 0;
                for(i = 0;i < strlen(tmp);i++){
                    if(tmp[i] != ','){
                        sample_names[f][g] = tmp[i];
                        g++;
                    }else{
                        sample_names[f][g] = 0;
                        f++;
                        g = 0;
                    }
                }
                sample_names[f][g] = 0;

                break;
            case 'q':
                quiet = 1;
                break;
            case 'h':
                help = 1;
                break;

            case 'o':
                outfile = optarg;
                break;
            case '?':
                exit(1);
                break;
            default:
                fprintf(stderr,"default\n\n\n\n");
                abort ();
        }
    }

    char* db_file = 0;
    char* reg_file = 0;
    char* tar_file = 0;

    db_file = argv[optind++];
    if(!db_file){
        fprintf(stderr,"No database file...\n");
        return EXIT_FAILURE;
    }

    reg_file = argv[optind++];
    if(!reg_file){
        fprintf(stderr,"No database file...\n");
        return EXIT_FAILURE;
    }

    tar_file = argv[optind++];
    if(!tar_file){
        fprintf(stderr,"No input file...\n");
        return EXIT_FAILURE;
    }

    TOMEDB* db = 0;

    db = main_read_db(db,db_file);

    int lines_read = 0;

    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    ////////////////
    ////////////////    Maximum nuber of regulators or targets - all need to fit in memory for learning
    ////////////////
    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

    clock_t cStartClock;

    cStartClock = clock();

    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    ////////////////
    ////////////////    Loading & normalizing data
    ////////////////
    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

    struct tome_data* td_regulators = init_tome_data(100000,db->num_tracks );

    file = open_track(reg_file, &gzipped);
    column_spec = get_column_specs_based_on_file_suffix(reg_file);

    lines_read = read_data_chunk(db,td_regulators,column_spec,file);

    free(column_spec);
    close_track(file, gzipped);

    if(lines_read ==100000){
        fprintf(stderr,"Warning: will only use the top %d lines from file %s\n", lines_read,reg_file);
    }

    td_regulators->us =  init_use_samples(db,sample_names,0, input_num_samples ,0);
    td_regulators = add_expression(db, td_regulators);

    //removes regions with expression in less than half of the selected samples...
    td_regulators = min_sample_filter(td_regulators, td_regulators->us->num_use_samples * 0.5);

    //removed the unused samples;
    td_regulators = remove_unused_samples(td_regulators);

    //normalize expresssiom
    td_regulators = norm_main(td_regulators,  norm);

    //copies pointers into the expression matrix
    td_regulators = make_expression_matrix(td_regulators);

    struct tome_data* td_targets = init_tome_data(100000,db->num_tracks );

    file = open_track(tar_file, &gzipped);
    column_spec = get_column_specs_based_on_file_suffix(tar_file);

    lines_read = read_data_chunk(db,td_targets,column_spec,file);

    free(column_spec);
    close_track(file, gzipped);

    if(lines_read ==100000){
        fprintf(stderr,"Warning: will only use the top %d lines from file %s\n", lines_read,tar_file);
    }

    fprintf(stderr,"READ:%d\n",lines_read);
    td_targets->us =  init_use_samples(db,sample_names,0, input_num_samples ,0);
    td_targets = add_expression(db, td_targets);
    fprintf(stderr,"READ:%d\n",td_targets->numseq);
    //removes regions with expression in less than half of the selected samples...
    td_targets = min_sample_filter(td_targets, td_targets->us->num_use_samples * 0.5);
    fprintf(stderr,"READ:%d\n",td_targets->numseq);

    //removed the unused samples;
    td_targets = remove_unused_samples(td_targets);
    fprintf(stderr,"READ:%d\n",td_targets->numseq);

    //normalize expresssiom
    td_targets = norm_main(td_targets,  norm);
    fprintf(stderr,"READ:%d\n",td_targets->numseq);

    //copies pointers into the expression matrix
    td_targets = make_expression_matrix(td_targets);
    fprintf(stderr,"READ:%d\n",td_targets->numseq);

    //////
    /// init random forest data ....
    /////

    rfd = malloc(sizeof(struct random_forrest_data));

    rfd->num_regulators = td_regulators->numseq;
    rfd->num_targets = td_targets->numseq;
    rfd->num_samples = td_targets->num_samples;
    rfd->num_tree = num_tree;
    rfd->num_threads = num_thread;

    if(!mtry){
        mtry = (int)(rfd->num_regulators/ 3.0);
    }
    rfd->mtry = mtry;

    rfd->regulators = malloc(sizeof(struct data_matrix));

    rfd->regulators->mat =  td_regulators->expression_matrix;
    rfd->regulators->row_names = malloc(sizeof(char*)  * td_regulators->numseq);
    for(i = 0; i < td_regulators->numseq;i++){
        rfd->regulators->row_names[i] = td_regulators->gc[i]->info;
    }

    rfd->targets = malloc(sizeof(struct data_matrix));
    rfd->targets->mat =  td_targets->expression_matrix;
    rfd->targets->row_names = malloc(sizeof(char*)  * td_targets->numseq);
    for(i = 0; i < td_targets->numseq;i++){
        rfd->targets->row_names[i] = td_targets->gc[i]->info;
    }

    rfd = create_rf_result(rfd);

    cStartClock = clock();
    rfd = start_rf_run(rfd);
    fprintf(stderr,"Completed RF in %4.2f seconds\n",(double)( clock() - cStartClock) / (double)CLOCKS_PER_SEC);
    cStartClock = clock();
    rfd = start_rdt_fdr_for_rd_results(rfd );
    fprintf(stderr,"Completed RDT in %4.2f seconds\n",(double)( clock() - cStartClock) / (double)CLOCKS_PER_SEC);

    for(i = 0; i < rfd->num_targets;i++){
        for(j = 0; j < rfd->num_regulators;j++){
            fprintf(stdout,"%s\t%s\t%f\t%f\t%f\t%f\n", td_regulators->gc[j]->info, td_targets->gc[i]->info,rfd->rf_results[i]->variance_reduction[j],rfd->rf_results[i]->oobestimate[j] ,  rfd->rf_results[i]->zscore[j],rfd->rf_results[i]->fdr[j] );
        }
    }

    free_tome_data(td_regulators);
    free_tome_data(td_targets);

    free_rf_result(rfd);
    free(rfd);

    free_db(db);

    return EXIT_SUCCESS;
}

Related

Wiki: Home