% pairInteract
% Timo Lassmann
% December 24th 2013
pairInteract is a program to find putative interactions between a list of regulators and a list of targets. It runs random forest based regression to explain the expression profile of a gene by the profiles of all regulators. Important variables are reported as putative interactors. In a second round of computation, pairInteract estimates the false discovery rate of all edges using random decision trees (RDT). In brief RDT is run on the top regulator and a randomized expression profile a thousand times each. For each iteration we assess whether there is is a statistically significant difference (Mann–Whitney U test p <= 0.05 ) in the distributions of tree based mean squared errors between real and randomized data. While the number of successful trial is below a given FDR threshold we continue adding the second best, third best etc.. regulator until the improvement by adding a regulator is above a give FDR threshold.
$ pairInteract <database name> <regulators.bed> <targets.bed> -out <output.txt> -norm unit -ntree 2000 -t 80
The output is a tab separated file listing:
/*
pairInteract - a collection of programs to work with CAGE data.
Author: Timo Lassmann <timolassmann@gmail.com>
Copyright (C) 2013 RIKEN
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <stdlib.h>
#include <stdio.h>
#include <getopt.h>
#include <string.h>
#include "tomedb.h"
#include "cauldron.h"
#define OPT_SAMPLE 1
#define OPT_NORM 2
#define OPT_NUM_TREE 3
#define OPT_NUM_THREAD 4
#define OPT_MTRY 5
int main (int argc, char * argv[])
{
struct random_forrest_data* rfd = 0;
FILE* file = 0;
int i,j,c,f,g;
char* tmp = 0;
int count;
int input_num_samples = 0;
char** sample_names = 0;
char* norm = 0;
int help = 0;
int quiet = 0;
int gzipped = 0;
char* column_spec = 0;
char* outfile = 0;
int num_tree = 1000;
int num_thread = 8;
int mtry = 0;
while (1){
static struct option long_options[] ={
{"sample",required_argument,0,OPT_SAMPLE},
{"norm",required_argument,0,OPT_NORM},
{"ntree",required_argument,0,OPT_NUM_TREE},
{"nthread",required_argument,0,OPT_NUM_THREAD},
{"mtry",required_argument,0,OPT_MTRY},
{"out",required_argument,0, 'o'},
{"quiet",0,0,'q'},
{"help",0,0,'h'},
{0, 0, 0, 0}
};
int option_index = 0;
c = getopt_long_only (argc, argv,"o:qh:",long_options, &option_index);
if (c == -1){
break;
}
switch(c) {
case 0:
break;
case OPT_MTRY:
mtry = atoi(optarg);
break;
case OPT_NUM_TREE:
num_tree = atoi(optarg);
break;
case OPT_NUM_THREAD:
num_thread = atoi(optarg);
break;
case OPT_NORM:
norm = optarg;
break;
case OPT_SAMPLE:
tmp = optarg;
count = byg_count(",", tmp);
//fprintf(stderr,"%d\n",count);
input_num_samples = count+1;
sample_names = malloc(sizeof(char*) * (count+1));
for(i = 0; i < count+1;i++){
sample_names[i] = malloc(sizeof(char)* 1000);
}
f = 0;
g = 0;
for(i = 0;i < strlen(tmp);i++){
if(tmp[i] != ','){
sample_names[f][g] = tmp[i];
g++;
}else{
sample_names[f][g] = 0;
f++;
g = 0;
}
}
sample_names[f][g] = 0;
break;
case 'q':
quiet = 1;
break;
case 'h':
help = 1;
break;
case 'o':
outfile = optarg;
break;
case '?':
exit(1);
break;
default:
fprintf(stderr,"default\n\n\n\n");
abort ();
}
}
char* db_file = 0;
char* reg_file = 0;
char* tar_file = 0;
db_file = argv[optind++];
if(!db_file){
fprintf(stderr,"No database file...\n");
return EXIT_FAILURE;
}
reg_file = argv[optind++];
if(!reg_file){
fprintf(stderr,"No database file...\n");
return EXIT_FAILURE;
}
tar_file = argv[optind++];
if(!tar_file){
fprintf(stderr,"No input file...\n");
return EXIT_FAILURE;
}
TOMEDB* db = 0;
db = main_read_db(db,db_file);
int lines_read = 0;
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////
//////////////// Maximum nuber of regulators or targets - all need to fit in memory for learning
////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
clock_t cStartClock;
cStartClock = clock();
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////
//////////////// Loading & normalizing data
////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
struct tome_data* td_regulators = init_tome_data(100000,db->num_tracks );
file = open_track(reg_file, &gzipped);
column_spec = get_column_specs_based_on_file_suffix(reg_file);
lines_read = read_data_chunk(db,td_regulators,column_spec,file);
free(column_spec);
close_track(file, gzipped);
if(lines_read ==100000){
fprintf(stderr,"Warning: will only use the top %d lines from file %s\n", lines_read,reg_file);
}
td_regulators->us = init_use_samples(db,sample_names,0, input_num_samples ,0);
td_regulators = add_expression(db, td_regulators);
//removes regions with expression in less than half of the selected samples...
td_regulators = min_sample_filter(td_regulators, td_regulators->us->num_use_samples * 0.5);
//removed the unused samples;
td_regulators = remove_unused_samples(td_regulators);
//normalize expresssiom
td_regulators = norm_main(td_regulators, norm);
//copies pointers into the expression matrix
td_regulators = make_expression_matrix(td_regulators);
struct tome_data* td_targets = init_tome_data(100000,db->num_tracks );
file = open_track(tar_file, &gzipped);
column_spec = get_column_specs_based_on_file_suffix(tar_file);
lines_read = read_data_chunk(db,td_targets,column_spec,file);
free(column_spec);
close_track(file, gzipped);
if(lines_read ==100000){
fprintf(stderr,"Warning: will only use the top %d lines from file %s\n", lines_read,tar_file);
}
fprintf(stderr,"READ:%d\n",lines_read);
td_targets->us = init_use_samples(db,sample_names,0, input_num_samples ,0);
td_targets = add_expression(db, td_targets);
fprintf(stderr,"READ:%d\n",td_targets->numseq);
//removes regions with expression in less than half of the selected samples...
td_targets = min_sample_filter(td_targets, td_targets->us->num_use_samples * 0.5);
fprintf(stderr,"READ:%d\n",td_targets->numseq);
//removed the unused samples;
td_targets = remove_unused_samples(td_targets);
fprintf(stderr,"READ:%d\n",td_targets->numseq);
//normalize expresssiom
td_targets = norm_main(td_targets, norm);
fprintf(stderr,"READ:%d\n",td_targets->numseq);
//copies pointers into the expression matrix
td_targets = make_expression_matrix(td_targets);
fprintf(stderr,"READ:%d\n",td_targets->numseq);
//////
/// init random forest data ....
/////
rfd = malloc(sizeof(struct random_forrest_data));
rfd->num_regulators = td_regulators->numseq;
rfd->num_targets = td_targets->numseq;
rfd->num_samples = td_targets->num_samples;
rfd->num_tree = num_tree;
rfd->num_threads = num_thread;
if(!mtry){
mtry = (int)(rfd->num_regulators/ 3.0);
}
rfd->mtry = mtry;
rfd->regulators = malloc(sizeof(struct data_matrix));
rfd->regulators->mat = td_regulators->expression_matrix;
rfd->regulators->row_names = malloc(sizeof(char*) * td_regulators->numseq);
for(i = 0; i < td_regulators->numseq;i++){
rfd->regulators->row_names[i] = td_regulators->gc[i]->info;
}
rfd->targets = malloc(sizeof(struct data_matrix));
rfd->targets->mat = td_targets->expression_matrix;
rfd->targets->row_names = malloc(sizeof(char*) * td_targets->numseq);
for(i = 0; i < td_targets->numseq;i++){
rfd->targets->row_names[i] = td_targets->gc[i]->info;
}
rfd = create_rf_result(rfd);
cStartClock = clock();
rfd = start_rf_run(rfd);
fprintf(stderr,"Completed RF in %4.2f seconds\n",(double)( clock() - cStartClock) / (double)CLOCKS_PER_SEC);
cStartClock = clock();
rfd = start_rdt_fdr_for_rd_results(rfd );
fprintf(stderr,"Completed RDT in %4.2f seconds\n",(double)( clock() - cStartClock) / (double)CLOCKS_PER_SEC);
for(i = 0; i < rfd->num_targets;i++){
for(j = 0; j < rfd->num_regulators;j++){
fprintf(stdout,"%s\t%s\t%f\t%f\t%f\t%f\n", td_regulators->gc[j]->info, td_targets->gc[i]->info,rfd->rf_results[i]->variance_reduction[j],rfd->rf_results[i]->oobestimate[j] , rfd->rf_results[i]->zscore[j],rfd->rf_results[i]->fdr[j] );
}
}
free_tome_data(td_regulators);
free_tome_data(td_targets);
free_rf_result(rfd);
free(rfd);
free_db(db);
return EXIT_SUCCESS;
}