|
From: <boc...@su...> - 2006-02-22 09:15:37
|
Update of /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job In directory sumo.genetics.ucla.edu:/tmp/cvs-serv8351/lib/perl/Nelson/Pipe/Container/Job Modified Files: CreateProfiles.pm PPLARunner.pm PickTopProfiles.pm ReadProfileOutput.pm ReadVGLOutput.pm ScoreSamplesViaProfiles.pm ScoreSamplesViaVGL.pm SifFileParser.pm SummarizeClassification.pm Added Files: CreateProfiles_2.pm Log Message: Many updates to the existing logic analysis libs and also a lot of new addtions particularly scripts Index: ScoreSamplesViaProfiles.pm =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/ScoreSamplesViaProfiles.pm,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** ScoreSamplesViaProfiles.pm 27 Sep 2005 22:54:38 -0000 1.5 --- ScoreSamplesViaProfiles.pm 22 Feb 2006 09:14:59 -0000 1.6 *************** *** 28,32 **** # for each file in the test set my @files; ! open LIST, $list_of_files or die; while(<LIST>) { chomp; --- 28,32 ---- # for each file in the test set my @files; ! open LIST, $list_of_files or die "Can't open $list_of_files\n"; while(<LIST>) { chomp; Index: PickTopProfiles.pm =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/PickTopProfiles.pm,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** PickTopProfiles.pm 19 Aug 2005 23:17:09 -0000 1.5 --- PickTopProfiles.pm 22 Feb 2006 09:14:58 -0000 1.6 *************** *** 31,35 **** my @subdirs = split /,/, $self->{profile_sub_dirs}; ! my @original_profiles = @{$self->_read_profiles($self->{complete_ppla_output})}; foreach my $subdir (@subdirs) { --- 31,35 ---- my @subdirs = split /,/, $self->{profile_sub_dirs}; ! 
#my @original_profiles = @{$self->_read_profiles($self->{complete_ppla_output})}; foreach my $subdir (@subdirs) { *************** *** 42,46 **** foreach my $b (keys %{$data->{frequency}{$subdir}{tally}{$c}}) { foreach my $a (keys %{$data->{frequency}{$subdir}{tally}{$c}{$b}}) { ! #print "DEBUG: $c $b $a $count_min ".$data->{frequency}{$subdir}{tally}{$c}{$b}{$a}."\n"; if ($data->{frequency}{$subdir}{tally}{$c}{$b}{$a} >= $count_min) { ##my $cache = $data->{all_profiles}; # done for memory issues --- 42,46 ---- foreach my $b (keys %{$data->{frequency}{$subdir}{tally}{$c}}) { foreach my $a (keys %{$data->{frequency}{$subdir}{tally}{$c}{$b}}) { ! print "DEBUG: $c $b $a $count_min ".$data->{frequency}{$subdir}{tally}{$c}{$b}{$a}."\n"; if ($data->{frequency}{$subdir}{tally}{$c}{$b}{$a} >= $count_min) { ##my $cache = $data->{all_profiles}; # done for memory issues *************** *** 79,83 **** my ($self, $file) = @_; my @results; ! open PROFILES, $file or die; while(<PROFILES>) { chomp; --- 79,83 ---- my ($self, $file) = @_; my @results; ! 
open PROFILES, $file or die "Can't open file $file\n"; while(<PROFILES>) { chomp; --- NEW FILE: CreateProfiles_2.pm ---
package Nelson::Pipe::Container::Job::CreateProfiles_2;

use base qw(Nelson::Pipe::Container::Job);
use strict;
use Data::Dumper;
use Storable;

# One-hot annotation rules applied, in this order, by _get_annotations.
#
# Each rule is
#   [ sif field, [ [ pattern => winning annotation ], ... ], [ annotations written ] ]
#
# The patterns of a rule are tried in the order listed and the first one that
# matches the sample's field wins: its annotation is set to 1 and every other
# annotation listed for the rule is set to 0.  A field that matches none of
# the patterns leaves the rule's annotations untouched.
my @ANNOTATION_RULES = (
  # HC
  [ hc => [ [ qr/1A/ => 'HC_1A' ], [ qr/1B/ => 'HC_1B' ],
            [ qr/2A/ => 'HC_2A' ], [ qr/2B/ => 'HC_2B' ] ],
    [ qw(HC_1A HC_1B HC_2A HC_2B) ] ],

  # tumor type
  [ tumor_type => [ [ qr/mixed/ => 'tumor_type_mixed' ], [ qr/gbm/   => 'tumor_type_gbm' ],
                    [ qr/oligo/ => 'tumor_type_oligo' ], [ qr/astro/ => 'tumor_type_astro' ] ],
    [ qw(tumor_type_mixed tumor_type_gbm tumor_type_oligo tumor_type_astro) ] ],

  # sex
  [ sex => [ [ qr/F/ => 'sex_f' ], [ qr/M/ => 'sex_m' ] ],
    [ qw(sex_f sex_m) ] ],

  # grade
  [ grade => [ [ qr/3/ => 'grade_3' ], [ qr/4/ => 'grade_4' ] ],
    [ qw(grade_3 grade_4) ] ],

  # survival cluster
  [ survival_cluster => [ [ qr/SC1/ => 'survival_cluster_1' ], [ qr/SC2/ => 'survival_cluster_2' ] ],
    [ qw(survival_cluster_1 survival_cluster_2) ] ],

  # These were added for the p53 Breast Cancer Dataset Miller et al 2005
  # grade (grade 4 doesn't exist in this dataset!!)
  # This rule must stay after the plain grade rule above: both write
  # grade_3/grade_4 and the later one wins.
  [ grade => [ [ qr/G1/ => 'grade_1' ], [ qr/G2/ => 'grade_2' ], [ qr/G3/ => 'grade_3' ] ],
    [ qw(grade_1 grade_2 grade_3 grade_4) ] ],

  # lymph_pos (0 is tested before 1)
  [ lymph_pos => [ [ qr/0/ => 'lymph_neg' ], [ qr/1/ => 'lymph_pos' ] ],
    [ qw(lymph_pos lymph_neg) ] ],

  # er_wt
  [ er_wt => [ [ qr/1/ => 'er_wt' ], [ qr/0/ => 'er_mt' ] ],
    [ qw(er_wt er_mt) ] ],

  # pgr_wt
  [ pgr_wt => [ [ qr/1/ => 'pgr_wt' ], [ qr/0/ => 'pgr_mt' ] ],
    [ qw(pgr_wt pgr_mt) ] ],

  # dlda_mt
  [ dlda_mt => [ [ qr/1/ => 'dlda_mt' ], [ qr/0/ => 'dlda_wt' ] ],
    [ qw(dlda_wt dlda_mt) ] ],

  # p53_wt
  [ p53_wt => [ [ qr/1/ => 'p53_wt' ], [ qr/0/ => 'p53_mt' ] ],
    [ qw(p53_wt p53_mt) ] ],
);

=head2 process

 Title   : process
 Usage   : $job->process($input, $output, $stash);
 Function: For every sub directory named in file_list_sub_dir, turns each
           sample list "<file_list_dir>/<sub_dir>/*_N.txt" whose number N
           satisfies start <= N < end into a profile file under
           "<profile_dir>/<sub_dir>".  When pre_cache_mas5 is '1' the MAS5
           calls are loaded once up front and, if mas5_cache_output is
           set, stored there with Storable afterwards.
 Example :
 Returns : no meaningful value
 Args    : $input  - hash ref; reads the {value} of file_list_dir,
                     file_list_sub_dir (comma separated), mas5_dir,
                     sif_file_hash (a key into $stash), start, end and
                     pre_cache_mas5
           $output - hash ref; reads the {value} of profile_dir and
                     mas5_cache_output
           $stash  - hash ref holding the parsed sif hash; when the entry
                     is undefined the hash is retrieved from
                     $self->{sif_file_hash_storable} instead

=cut

sub process {
  my ($self, $input, $output, $stash) = @_;

  my $file_list_dir      = $input->{file_list_dir}{value};
  my $file_list_sub_dir  = $input->{file_list_sub_dir}{value};
  my $mas5_dir           = $input->{mas5_dir}{value};
  my $profile_dir        = $output->{profile_dir}{value};
  my $sif_file_hash_name = $input->{sif_file_hash}{value};
  my $sif_file_hash      = $stash->{$sif_file_hash_name};
  my $start              = $input->{start}{value};
  my $end                = $input->{end}{value};
  my $pre_cache_mas5     = $input->{pre_cache_mas5}{value};
  my $mas5_cache_output  = $output->{mas5_cache_output}{value};

  # try to load the sif_file_hash if undef
  if (!defined($sif_file_hash)) {
    $sif_file_hash = retrieve($self->{sif_file_hash_storable});
  }

  # pre-cache the mas5 results
  $self->{mas5_cache} = {};
  if ($pre_cache_mas5 eq '1') {
    $self->{mas5_cache} = $self->_pre_cache_mas5($mas5_dir);
  }

  foreach my $sub_dir (split /,/, $file_list_sub_dir) {
    # List form: the path is handed to mkdir as a single argument and never
    # passes through a shell.
    system('mkdir', '-p', "$profile_dir/$sub_dir");

    foreach my $list_of_cel_files (glob("$file_list_dir/$sub_dir/*.txt")) {
      # Capture in list context so a list without a trailing _N.txt is
      # skipped instead of silently reusing $1 from an earlier match.
      my ($file_num) = $list_of_cel_files =~ /_(\d+)\.txt$/;
      next unless defined $file_num;

      if ($file_num >= $start && $file_num < $end) {
        $self->_process_cel_files($list_of_cel_files, $mas5_dir, $profile_dir, $sub_dir, $sif_file_hash);
      }
    }
  }

  if ($pre_cache_mas5 eq '1' and defined($mas5_cache_output)) {
    store $self->{mas5_cache}, $mas5_cache_output;
  }
}

# Turns one line of a MAS5 result file into (probe id, call), where call is 1
# when the line's p-value is <= 0.05 and 0 otherwise.  Returns an empty list
# for lines that are to be ignored.
#
# $self->{parse_old_mas5} selects the layout:
#   true  - probe id in column 0, p-value in column 5; lines whose first
#           column starts with "Probe" or that have fewer than 5 columns
#           are ignored
#   == 0  - probe id in column 0, p-value in column 1
#
# NOTE(review): the old layout only requires 5 columns although it reads
# column index 5, and the new layout does not skip a header line -- confirm
# against real input files before tightening either check.
sub _mas5_call {
  my ($self, $line) = @_;
  my @tokens = split /\t/, $line;

  if ($self->{parse_old_mas5} && scalar(@tokens) > 4 && $tokens[0] !~ /^Probe/) {
    return ($tokens[0], ($tokens[5] <= 0.05 ? 1 : 0));
  }
  elsif ($self->{parse_old_mas5} == 0) {
    return ($tokens[0], ($tokens[1] <= 0.05 ? 1 : 0));
  }
  return;
}

# Reads every "<mas5_dir>/*.txt" file and returns a hash ref with
#   filenames => [ base name (no directory, no .txt) of each file, in glob order ]
#   probes    => { probe id => [ one 0/1 call per file that listed the probe ] }
#
# _parse_file looks calls up by the position of the file in {filenames}.
# NOTE(review): that lookup assumes every file lists the same probes; a probe
# missing from one file would shift the calls of all later files -- confirm.
#
# Dies when a file cannot be opened.
sub _pre_cache_mas5 {
  my ($self, $mas5_dir) = @_;
  my $cache = {};

  foreach my $file (glob("$mas5_dir/*.txt")) {
    open my $fh, '<', $file or die "can't open $file: $!";

    my ($filename) = $file =~ m{/([^/]+)\.txt$};
    push @{$cache->{filenames}}, $filename;

    while (my $line = <$fh>) {
      my ($probe, $call) = $self->_mas5_call($line) or next;
      push @{$cache->{probes}{$probe}}, $call;
    }
    close $fh;
  }
  return($cache);
}

# Builds the profile for one list of sample files.
#
# Every non-empty line of $list_of_cel_files names one sample.  For each
# sample the annotations (from $sif_hash) and the MAS5 calls are collected,
# then everything is written to
# "<profile_dir>/<sub_dir>/file_list_<N>.profile", N being the number found
# in the name of the list file.
#
# Dies when the list has no number in its name or cannot be opened.
sub _process_cel_files {
  my ($self, $list_of_cel_files, $mas5_dir, $profile_dir, $sub_dir, $sif_hash) = @_;
  my $results = {};

  # Prefer the file_list_N.txt form; fall back to the trailing _N.txt that
  # process() selects on, so every list process() passes in gets a number.
  my ($file_num) = $list_of_cel_files =~ /file_list_(\d+)\.txt/;
  ($file_num) = $list_of_cel_files =~ /_(\d+)\.txt$/ unless defined $file_num;
  die "Can't find a list number in $list_of_cel_files\n" unless defined $file_num;

  open my $list, '<', $list_of_cel_files
    or die "Can't open $list_of_cel_files: $!\n";

  while (my $old_filename = <$list>) {
    chomp $old_filename;
    next if $old_filename eq '';   # a blank line is not a sample

    # the name used in the profile has spaces and '#' replaced by '_'
    (my $filename = $old_filename) =~ tr/ #/_/;
    push @{$results->{samples}{names}}, $filename;

    print "Getting Annotations for $filename\n";
    $self->_get_annotations($filename, $old_filename, $results, $sif_hash);

    print "Parsing File $filename\n";
    $self->_parse_file($old_filename, $results, $mas5_dir);
  }
  close $list;

  # at this point all the P/A calls are loaded for all the files
  # in the list, next print it out
  print "Printing profile\n";
  my $outfile = "$profile_dir/$sub_dir/file_list_$file_num.profile";
  $self->_print_profile($results, $outfile);
  return;
}

# Records the 0/1 annotations of one sample in
# $results->{samples}{annotations}{<annotation>}{$filename}.
#
# $old_filename is the key of the sample in $sif_hash->{files}; $filename is
# the cleaned-up name used inside $results.  The phenotype fields are handled
# by @ANNOTATION_RULES; survival time is handled separately below.
sub _get_annotations {
  my ($self, $filename, $old_filename, $results, $sif_hash) = @_;

  foreach my $rule (@ANNOTATION_RULES) {
    my ($field, $patterns, $names) = @$rule;
    my $value = $sif_hash->{'files'}{$old_filename}{$field};

    foreach my $pattern (@$patterns) {
      my ($regex, $winner) = @$pattern;
      next unless $value =~ $regex;

      foreach my $name (@$names) {
        $results->{samples}{annotations}{$name}{$filename} = ($name eq $winner) ? 1 : 0;
      }
      last;   # first matching pattern wins
    }
  }

  # survival time
  # this is a bit more tricky because I want a profile for each survival time:
  # group $i is 1 when the sample survived at least as long as the $i-th
  # smallest survival time known to the sif hash.
  # The key really is spelled "survial_time_group_"; it is kept as is because
  # it ends up as a row name in the written profile.
  my $i = 0;
  foreach my $survival_time (sort {$a <=> $b} keys %{$sif_hash->{'files_by_survival_time'}}) {
    $i++;
    $results->{samples}{annotations}{"survial_time_group_$i"}{$filename}
      = ($sif_hash->{'files'}{$old_filename}{survival_time} >= $survival_time) ? 1 : 0;
  }
  return;
}

# Writes the collected profile to $outfile as tab separated text:
#   - a "samples" row listing the sample names,
#   - one row per annotation (sorted by name) holding one value per sample,
#     in the order of the "samples" row,
#   - one row per probe (sorted by id) holding the stored calls, each
#     followed by a tab (so probe rows end in a tab).
#
# A sample without a value for an annotation gets an empty cell, so the
# columns of every annotation row line up with the "samples" row.
#
# Dies when the file cannot be opened or closed.
sub _print_profile {
  my ($self, $results, $outfile) = @_;
  open my $out, '>', $outfile or die "Can't open outfile: $outfile\n";

  my @names = @{$results->{samples}{names}};
  print $out "samples\t".join("\t", @names),"\n";

  foreach my $anno (sort keys %{$results->{samples}{annotations}}) {
    my $value_for = $results->{samples}{annotations}{$anno};
    my @cells = map { defined $value_for->{$_} ? $value_for->{$_} : '' } @names;
    print $out join("\t", $anno, @cells), "\n";
  }

  foreach my $probe (sort keys %{$results->{probes}}) {
    print $out "$probe\t", (map { "$_\t" } @{$results->{probes}{$probe}}), "\n";
  }

  # buffered write errors only show up here
  close $out or die "Can't close outfile: $outfile: $!\n";
  return;
}

# Appends the 0/1 call of every probe of sample $file to
# $results->{probes}{<probe id>} (one array element per sample, in the order
# the samples are parsed).
#
# With a pre-built cache ($self->{mas5_cache}{probes}) the calls are copied
# from the cache, using the position of $file in the cache's filenames list.
# Without a cache "<mas5_dir>/<file>.txt" is read directly.
#
# Dies when $file is not part of the cache or its MAS5 file cannot be opened.
sub _parse_file {
  my ($self, $file, $results, $mas5_dir) = @_;

  if (defined($self->{mas5_cache}{probes})) {
    # then the cache is used
    # find the offset for this file
    my $filenames = $self->{mas5_cache}{filenames};
    my $index;
    foreach my $candidate (0 .. $#{$filenames}) {
      if ($filenames->[$candidate] eq $file) {
        $index = $candidate;
        last;
      }
    }
    # An unknown file must not fall back to the calls of the first file.
    die "No cached MAS5 calls for '$file'\n" unless defined $index;

    # now iterate over and copy calls to results structure
    foreach my $probe (keys %{$self->{mas5_cache}{probes}}) {
      push @{$results->{probes}{$probe}}, $self->{mas5_cache}{probes}{$probe}[$index];
    }
  }
  else {
    # Calls are pushed onto per-probe arrays, the same shape the cached
    # branch produces and the only shape _print_profile can write.
    open my $fh, '<', "$mas5_dir/$file.txt"
      or die "can't open $mas5_dir/$file.txt: $!";
    while (my $line = <$fh>) {
      my ($probe, $call) = $self->_mas5_call($line) or next;
      push @{$results->{probes}{$probe}}, $call;
    }
    close $fh;
  }
  return;
}

1;
Index: SifFileParser.pm
=================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/SifFileParser.pm,v retrieving revision 1.9 retrieving revision 1.10 diff -C2 -d -r1.9 -r1.10 *** SifFileParser.pm 17 Feb 2006 00:09:22 -0000 1.9 --- SifFileParser.pm 22 Feb 2006 09:15:00 -0000 1.10 *************** *** 25,29 **** my $samples; ! if ($sif_format eq 'geo') { $samples = $self->_read_geo_sample_list($sif_file, $file_map_file); } else { $samples = $self->_read_sample_list($sif_file, $self->{phenotypes}, $self->{col_ordering}); } $stash->{$output_hash_name} = $samples; --- 25,31 ---- my $samples; ! ! if ($sif_format eq 'simple') { $samples = $self->_read_simple_sample_list($sif_file); } ! elsif ($sif_format eq 'geo') { $samples = $self->_read_geo_sample_list($sif_file, $file_map_file); } else { $samples = $self->_read_sample_list($sif_file, $self->{phenotypes}, $self->{col_ordering}); } $stash->{$output_hash_name} = $samples; *************** *** 130,133 **** --- 132,156 ---- my $final_output = {'files_by_hc' => $self->{files_by_hc}, 'files' => $self->{files}, 'files_by_survival_time' => $self->{files_by_survival_time}}; return($final_output); + } + + # this just reads a three column tab file used by Marc's (Bin's) program + # that links filename (without extension) to phenotype. 
It's only useful + # for linking files to one phenotype class at a time + sub _read_simple_sample_list { + my ($self, $input_sample_list) = @_; + open INPUT, "<$input_sample_list" or die; + while (<INPUT>) { + chomp; + my @tokens = split /\t/; + my $filename = $tokens[1]; + my $pheno = $tokens[2]; + if ($pheno eq 'TAL_R') { $self->{files}{$filename}{TAL_R} = 1; } + else { $self->{files}{$filename}{TAL_R} = 0; } + } + close INPUT; + + my $final_output = {'files' => $self->{files}}; + return($final_output); + } Index: CreateProfiles.pm =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/CreateProfiles.pm,v retrieving revision 1.15 retrieving revision 1.16 diff -C2 -d -r1.15 -r1.16 *** CreateProfiles.pm 17 Feb 2006 00:09:22 -0000 1.15 --- CreateProfiles.pm 22 Feb 2006 09:14:57 -0000 1.16 *************** *** 28,31 **** --- 28,32 ---- my $end = $input->{end}{value}; my $pre_cache_mas5 = $input->{pre_cache_mas5}{value}; + my $no_overwrite = $input->{no_overwrite}{value}; my $mas5_cache_output = $output->{mas5_cache_output}{value}; *************** *** 115,118 **** --- 116,121 ---- $list_of_cel_files =~ /file_list_(\d+)\.txt/; my $file_num = $1; + my $outfile = "$profile_dir/$sub_dir/file_list_$file_num.profile"; + if ($self->{no_overwrite} && -f "$profile_dir/$sub_dir/file_list_$file_num.profile") { return; } open LIST, "<$list_of_cel_files" or die; my $i = 0; *************** *** 136,140 **** # in the list, next print it out print "Printing profile\n"; - my $outfile = "$profile_dir/$sub_dir/file_list_$file_num.profile"; $self->_print_profile($results, $outfile); } --- 139,142 ---- *************** *** 297,300 **** --- 299,310 ---- } + # These were added for the T-cell leukemia dataset Soulier et al 2005 + if ($sif_hash->{'files'}{$old_filename}{TAL_R} =~ /1/) { + $results->{samples}{annotations}{TAL_R}{$filename} = 1; + 
$results->{samples}{annotations}{HOX_R}{$filename} = 0; + } elsif ($sif_hash->{'files'}{$old_filename}{TAL_R} =~ /0/) { + $results->{samples}{annotations}{TAL_R}{$filename} = 0; + $results->{samples}{annotations}{HOX_R}{$filename} = 1; + } } Index: ReadVGLOutput.pm =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/ReadVGLOutput.pm,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** ReadVGLOutput.pm 31 Oct 2005 19:02:21 -0000 1.6 --- ReadVGLOutput.pm 22 Feb 2006 09:14:59 -0000 1.7 *************** *** 24,30 **** --- 24,36 ---- my $profile_output_dir = $self->{profile_output_dir}; my $profile_output_sub_dirs = $self->{profile_output_sub_dirs}; + my $compare_to_reference = $self->{compare_to_reference}; my $reference_profile = $self->{reference_profile}; my $output_dir = $self->{output_dir}; my $col_offset = $self->{col_offset}; + my $parser_type = $self->{parser_type}; + my $base_col = $self->{base_col}; + my $pheno_str_1 = $self->{pheno_str_1}; + my $pheno_str_2 = $self->{pheno_str_2}; + my $input_file_name = $self->{input_file_name}; # the hash that holds hash and cache data *************** *** 36,41 **** # the reference sample ! my $reference = $self->_parse_vgl_output($reference_profile, 0); ! $results->{parsed_output}{'100'}{reference} = $reference; # comment out, these may have already been created --- 42,50 ---- # the reference sample ! my $reference; ! if ($compare_to_reference) { ! $reference = $self->_parse_vgl_output($reference_profile, 0); ! $results->{parsed_output}{'100'}{reference} = $reference; ! } # comment out, these may have already been created *************** *** 56,61 **** my @files = glob("$profile_output_dir/$subdir/file_list_*.txt_results"); foreach my $file (@files) { ! print "Curr Profile: $file/All_DChip_expression_vals_longNames_groupTtest.xls\n"; ! 
my $curr_output = $self->_parse_vgl_output("$file/All_DChip_expression_vals_longNames_groupTtest.xls", $col_offset); # this is used to store a count/lines for all profiles encountered in this particular subdir $self->_add_to_all_profiles($curr_output, $results->{all_profiles}{$subdir}); --- 65,72 ---- my @files = glob("$profile_output_dir/$subdir/file_list_*.txt_results"); foreach my $file (@files) { ! print "Curr Profile: $file/$input_file_name\n"; ! my $curr_output; ! if ($parser_type eq '2_phenotypes') { $curr_output = $self->_parse_2_pheno_vgl_output("$file/$input_file_name", $col_offset, $base_col, $pheno_str_1, $pheno_str_2); } ! else { $curr_output = $self->_parse_vgl_output("$file/$input_file_name", $col_offset); } # this is used to store a count/lines for all profiles encountered in this particular subdir $self->_add_to_all_profiles($curr_output, $results->{all_profiles}{$subdir}); *************** *** 64,68 **** ##$results->{parsed_output}{$subdir}{$file} = $curr_output; # not used by other objects so eliminate to reduce memory usage $results->{comparison_to_reference}{$subdir}{$file} = {}; # stores percentage overlap between refernce and curr profile set ! $self->_compare_vgl_output($reference, $curr_output, $results->{comparison_to_reference}{$subdir}{$file}); $self->_tally_results($curr_output, $results->{frequency}{$subdir}); } --- 75,79 ---- ##$results->{parsed_output}{$subdir}{$file} = $curr_output; # not used by other objects so eliminate to reduce memory usage $results->{comparison_to_reference}{$subdir}{$file} = {}; # stores percentage overlap between refernce and curr profile set ! 
if ($compare_to_reference) { $self->_compare_vgl_output($reference, $curr_output, $results->{comparison_to_reference}{$subdir}{$file}); } $self->_tally_results($curr_output, $results->{frequency}{$subdir}); } *************** *** 75,78 **** --- 86,90 ---- # DEBUG + print "DEBUG DUMP!\n"; print Dumper($results->{comparison_to_reference}); print Dumper($results->{frequency}); *************** *** 113,116 **** --- 125,160 ---- } } + + # only one column needs to be examined when there is one phenotype with two states + # FIXME: the phenotype is hardcoded here!!! + sub _parse_2_pheno_vgl_output { + my ($self, $file, $offset, $base, $pheno_str_1, $pheno_str_2) = @_; + open INPUT, "<$file" or die "Can't open file: $file\n"; + my $result; + while(<INPUT>) { + if (!/^probe.set/) { + chomp; + my @tokens = split /\t/; + my $HC = ""; + my $pvalue = $tokens[$base+5]; + my $mean = $tokens[$base+3]; + my $true_count = 0; + if ($tokens[$base+6] eq "TRUE" && $tokens[$base+3] >= 0) { + $HC = $pheno_str_1; + $true_count++; + } elsif ($tokens[$base+6] eq "TRUE" && $tokens[$base+3] < 0) { + $HC = $pheno_str_2; + $true_count++; + } + if ($true_count == 1) { + $result->{$HC}{$tokens[0]}{pvalue} = $pvalue; + $result->{$HC}{$tokens[0]}{mean} = $mean + } + } + } + close INPUT; + return($result); + } + sub _parse_vgl_output { Index: SummarizeClassification.pm =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/SummarizeClassification.pm,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** SummarizeClassification.pm 5 Oct 2005 00:04:20 -0000 1.5 --- SummarizeClassification.pm 22 Feb 2006 09:15:00 -0000 1.6 *************** *** 64,68 **** my $correct_annotation = $data->{$subdir}{$number_profiles_used}{$file}{correct_annotation}; print "Highest: $highest_phenotype Correct: $correct_annotation\n"; ! 
if ($highest_phenotype =~ /$correct_annotation/) { $output->{$subdir}{$number_profiles_used}{correct_classification}++; print "CORRECT!!!\n"; } --- 64,68 ---- my $correct_annotation = $data->{$subdir}{$number_profiles_used}{$file}{correct_annotation}; print "Highest: $highest_phenotype Correct: $correct_annotation\n"; ! if ($highest_phenotype =~ /$correct_annotation/i) { $output->{$subdir}{$number_profiles_used}{correct_classification}++; print "CORRECT!!!\n"; } Index: PPLARunner.pm =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/PPLARunner.pm,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** PPLARunner.pm 8 Dec 2005 02:30:46 -0000 1.7 --- PPLARunner.pm 22 Feb 2006 09:14:58 -0000 1.8 *************** *** 21,24 **** --- 21,25 ---- my $start = $input->{start}{value}; my $end = $input->{end}{value}; + my $no_overwrite = $input->{no_overwrite}{value}; foreach my $subdir (split /,/, $self->{profiles_sub_dirs}) { *************** *** 26,30 **** print "DIRECTORY: ".$self->{profiles_dir}."/$subdir\n"; foreach my $file (@files) { - $file =~ /_(\d+)\.profile$/; my $file_num = $1; --- 27,30 ---- *************** *** 34,41 **** $file =~ /\/(\w+).profile$/; my $filename = $1; my $command = $self->{ppla_bin}." ".$self->{entropy_filter}." ".$self->{individual_u_max}." ".$self->{together_u_min}." showNoBits ".$self->{lowA}." ".$self->{highA}." ".$self->{number_profiles}." < $file > ".$self->{output_dir}."/$subdir/$filename.".$self->{unique_id}."output"; print STDERR "$command\n"; - system($command); system("mkdir -p ".$self->{output_dir}."/".$subdir); # FIXME: what do I need to do to get logging working!?!? #$self->log("PPLARunner", $command); --- 34,42 ---- $file =~ /\/(\w+).profile$/; my $filename = $1; + next if ($no_overwrite && -f $self->{output_dir}."/$subdir/$filename.".$self->{unique_id}."output"); my $command = $self->{ppla_bin}." 
".$self->{entropy_filter}." ".$self->{individual_u_max}." ".$self->{together_u_min}." showNoBits ".$self->{lowA}." ".$self->{highA}." ".$self->{number_profiles}." < $file > ".$self->{output_dir}."/$subdir/$filename.".$self->{unique_id}."output"; print STDERR "$command\n"; system("mkdir -p ".$self->{output_dir}."/".$subdir); + system($command); # FIXME: what do I need to do to get logging working!?!? #$self->log("PPLARunner", $command); Index: ScoreSamplesViaVGL.pm =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/ScoreSamplesViaVGL.pm,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** ScoreSamplesViaVGL.pm 31 Oct 2005 19:02:21 -0000 1.4 --- ScoreSamplesViaVGL.pm 22 Feb 2006 09:14:59 -0000 1.5 *************** *** 28,31 **** --- 28,32 ---- my $exp_values = $self->_read_exp_values($self->{exp_values}); + #print Dumper($exp_values); exit; # for each file in the test set *************** *** 144,148 **** while(<IN>) { chomp; ! if (/^probe_set/) { @files = split /\t/; shift @files; --- 145,149 ---- while(<IN>) { chomp; ! if (/^probe.set/) { @files = split /\t/; shift @files; Index: ReadProfileOutput.pm =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/ReadProfileOutput.pm,v retrieving revision 1.8 retrieving revision 1.9 diff -C2 -d -r1.8 -r1.9 *** ReadProfileOutput.pm 8 Dec 2005 02:30:46 -0000 1.8 --- ReadProfileOutput.pm 22 Feb 2006 09:14:59 -0000 1.9 *************** *** 23,26 **** --- 23,27 ---- my $profile_output_sub_dirs = $self->{profile_output_sub_dirs}; my $reference_profile = $self->{reference_profile}; + my $compare_to_reference = $self->{compare_to_reference}; my $output_dir = $self->{output_dir}; my @profiles = split /,/, $self->{profiles_to_count}; *************** *** 30,35 **** # the reference sample ! 
my $reference = $self->_parse_ppla_output($reference_profile, \@profiles); ! $results->{parsed_output}{'100'}{reference} = $reference; $Data::Dumper::Indent = 1; --- 31,39 ---- # the reference sample ! my $reference; ! if ($compare_to_reference) { ! $reference = $self->_parse_ppla_output($reference_profile, \@profiles); ! $results->{parsed_output}{'100'}{reference} = $reference; ! } $Data::Dumper::Indent = 1; *************** *** 66,70 **** $results->{comparison_to_reference}{$subdir}{$file} = {}; # stores percentage overlap between refernce and curr profile set ! $self->_compare_ppla_output($reference, $curr_output, $results->{comparison_to_reference}{$subdir}{$file}); $self->_tally_results($curr_output, $results->{frequency}{$subdir}); } --- 70,74 ---- $results->{comparison_to_reference}{$subdir}{$file} = {}; # stores percentage overlap between refernce and curr profile set ! if ($compare_to_reference) { $self->_compare_ppla_output($reference, $curr_output, $results->{comparison_to_reference}{$subdir}{$file}); } $self->_tally_results($curr_output, $results->{frequency}{$subdir}); } *************** *** 76,81 **** # DEBUG #print Dumper($results->{comparison_to_reference}); ! #print Dumper(keys %{$results->{frequency}{50}}); } --- 80,87 ---- # DEBUG + #print Dumper (keys %{$results}); #print Dumper($results->{comparison_to_reference}); ! print Dumper(keys %{$results->{frequency}{75}}); ! print Dumper($results->{frequency}{75}); } |