|
From: <boc...@su...> - 2006-02-28 07:36:47
|
Update of /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job In directory sumo.genetics.ucla.edu:/tmp/cvs-serv3630/lib/perl/Nelson/Pipe/Container/Job Modified Files: ReadVGLOutput.pm ScoreSamplesViaVGL.pm Log Message: I did a code review on these modules to make sure the performance I'm seeing with VGL is correct. I found some bugs, specifically in the 1A category (due to a problem with the VGL output format) and also with the mean of the category being used rather than the other categories' means. These problems have been fixed, yet the performance looks the same, if not a little worse. Index: ScoreSamplesViaVGL.pm =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/ScoreSamplesViaVGL.pm,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** ScoreSamplesViaVGL.pm 23 Feb 2006 22:11:59 -0000 1.6 --- ScoreSamplesViaVGL.pm 28 Feb 2006 07:36:43 -0000 1.7 *************** *** 116,120 **** foreach my $probe (keys %{$top_profiles->{$hc}}) { #print "On File: $file using HC: $hc using Probe: $probe\n"; ! my $score_mean = $top_profiles->{$hc}{$probe}{mean} / $top_profiles->{$hc}{$probe}{counts}; my $sample_mean = $exp_values->{$file}{$probe}; #print "$file $probe ".Dumper($exp_values->{$file}{$probe})."\n"; --- 116,147 ---- foreach my $probe (keys %{$top_profiles->{$hc}}) { #print "On File: $file using HC: $hc using Probe: $probe\n"; ! # BUG: I think this should actually be the others_mean! ! #my $score_mean = $top_profiles->{$hc}{$probe}{mean} / $top_profiles->{$hc}{$probe}{counts}; ! my $score_mean = $top_profiles->{$hc}{$probe}{others_mean} / $top_profiles->{$hc}{$probe}{counts}; ! ! # just used one vgl output for this ! #The probe: 202189_x_at ! #$VAR1 = { ! # 'others_mean' => '1271.90763461538', ! # 'pvalue' => '2.13236165697335e-12', ! # 'mean' => '601.166214285714', ! # 'counts' => 3 ! # }; ! ! 
# something is a bit fishy, the counts above, why isn't it 1? ! # now if I use the first 10 vgl output files: ! #The probe: 202189_x_at ! #$VAR1 = { ! # 'others_mean' => '14165.1471730769', ! # 'pvalue' => '1.16199434367298e-10', ! # 'mean' => '6678.66514285714', ! # 'counts' => 33 ! # }; ! # where does 33 come from? Also, is that an average p-value? I need to debug where this structure is coming from! ! #print "The probe: 202189_x_at\n"; #202189_x_at ! #print Dumper $top_profiles->{$hc}{'202189_x_at'}; exit; ! # FIXME: the count problem was isolated to the parsing script and should be fixed ! ! my $sample_mean = $exp_values->{$file}{$probe}; #print "$file $probe ".Dumper($exp_values->{$file}{$probe})."\n"; *************** *** 127,133 **** # FIXME: this actually performs worse so I think I need to keep track of which VG are actually lower expression # then the others ! #if (defined($sample_mean) && abs($sample_mean/$score_mean) >= 2) { ! if (defined($sample_mean) && (($sample_mean/$score_mean) >= 2 || ($sample_mean/$score_mean) <= 0.5 )) { #print " Yes this is positive for $hc\n"; $class->{$subdir}{$profile_count_cutoff}{$file}{scores}{$hc}{raw_score} += 1; } else { --- 154,162 ---- # FIXME: this actually performs worse so I think I need to keep track of which VG are actually lower expression # then the others ! if (defined($sample_mean) && abs($sample_mean/$score_mean) >= 2 && $top_profiles->{$hc}{$probe}{mean} > $top_profiles->{$hc}{$probe}{others_mean}) { ! 
#if (defined($sample_mean) && (($sample_mean/$score_mean) >= 2 || ($sample_mean/$score_mean) <= 0.5 )) { #print " Yes this is positive for $hc\n"; + $class->{$subdir}{$profile_count_cutoff}{$file}{scores}{$hc}{raw_score} += 1; + } elsif (defined($sample_mean) && abs($sample_mean/$score_mean) <= 0.5 && $top_profiles->{$hc}{$probe}{mean} < $top_profiles->{$hc}{$probe}{others_mean}) { $class->{$subdir}{$profile_count_cutoff}{$file}{scores}{$hc}{raw_score} += 1; } else { Index: ReadVGLOutput.pm =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/ReadVGLOutput.pm,v retrieving revision 1.8 retrieving revision 1.9 diff -C2 -d -r1.8 -r1.9 *** ReadVGLOutput.pm 23 Feb 2006 22:11:58 -0000 1.8 --- ReadVGLOutput.pm 28 Feb 2006 07:36:42 -0000 1.9 *************** *** 9,12 **** --- 9,15 ---- # FIXME: doesn't work with multiple subdirs + # DEBUG + my $foo = {}; + =head2 process *************** *** 123,128 **** foreach my $info (sort keys %{ $reference->{$hc}{$probe}}) { $results->{$hc}{$probe}{$info} += $reference->{$hc}{$probe}{$info}; ! $results->{$hc}{$probe}{counts}++; } } } --- 126,133 ---- foreach my $info (sort keys %{ $reference->{$hc}{$probe}}) { $results->{$hc}{$probe}{$info} += $reference->{$hc}{$probe}{$info}; ! # FIXME: is there a reason the counts might not be accurate here???!?!?! ! #$results->{$hc}{$probe}{counts}++; } + $results->{$hc}{$probe}{counts}++; } } *************** *** 160,164 **** } ! sub _parse_vgl_output { my ($self, $file, $offset) = @_; --- 165,170 ---- } ! # FIXME: the program that makes the VGL output reverses the order of one of the output ! 
# columns, make sure this is taking into account when the following code is run sub _parse_vgl_output { my ($self, $file, $offset) = @_; *************** *** 187,195 **** my $pvalue = 0; my $mean = 0; my $true_count = 0; if ($tokens[97-$offset] eq "TRUE") { $HC = $classification->{97}; $pvalue = $tokens[96-$offset]; ! $mean = $tokens[93-$offset]; $true_count++; } if ($tokens[104-$offset] eq "TRUE") { --- 193,206 ---- my $pvalue = 0; my $mean = 0; + my $others_mean = 0; my $true_count = 0; if ($tokens[97-$offset] eq "TRUE") { $HC = $classification->{97}; $pvalue = $tokens[96-$offset]; ! # BUG: the column changes here, the first entry is reversed where the mean of 1A is first and ! # the second column 1B_2A_2B mean ! #$mean = $tokens[93-$offset]; ! $mean = $tokens[92-$offset]; ! $others_mean = $tokens[93-$offset]; $true_count++; } if ($tokens[104-$offset] eq "TRUE") { *************** *** 197,200 **** --- 208,212 ---- $pvalue = $tokens[103-$offset]; $mean = $tokens[100-$offset]; + $others_mean = $tokens[99-$offset]; $true_count++; } if ($tokens[111-$offset] eq "TRUE") { *************** *** 202,209 **** --- 214,223 ---- $pvalue = $tokens[110-$offset]; $mean = $tokens[107-$offset]; + $others_mean = $tokens[106-$offset]; $true_count++; } if ($tokens[118-$offset] eq "TRUE") { $HC = $classification->{118}; $pvalue = $tokens[117-$offset]; + $others_mean = $tokens[116-$offset]; $mean = $tokens[114-$offset]; $true_count++; *************** *** 219,223 **** #$result->{by_pvalue}{$HC}{$pvalue}{$tokens[0]} = 1; $result->{$HC}{$tokens[0]}{pvalue} = $pvalue; ! $result->{$HC}{$tokens[0]}{mean} = $mean } } --- 233,243 ---- #$result->{by_pvalue}{$HC}{$pvalue}{$tokens[0]} = 1; $result->{$HC}{$tokens[0]}{pvalue} = $pvalue; ! $result->{$HC}{$tokens[0]}{mean} = $mean; ! $result->{$HC}{$tokens[0]}{others_mean} = $others_mean; ! if ($mean > $others_mean) { ! #print "Mean of $mean is greater than others mean of $others_mean for $tokens[0]\n"; ! } else { ! 
#print "Mean of $mean is less than others mean of $others_mean for $tokens[0]\n"; ! } } } |