You can subscribe to this list here.
| 2006 |
Jan
|
Feb
(38) |
Mar
(5) |
Apr
|
May
|
Jun
|
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
|---|
|
From: <all...@su...> - 2006-03-14 18:19:07
|
Update of /cvsroot/libnelson/Pg/celsius/bin In directory sumo.genetics.ucla.edu:/tmp/cvs-serv8582/bin Modified Files: celsius-extract Log Message: updates for marc Index: celsius-extract =================================================================== RCS file: /cvsroot/libnelson/Pg/celsius/bin/celsius-extract,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** celsius-extract 16 Feb 2006 02:49:35 -0000 1.5 --- celsius-extract 14 Mar 2006 18:18:39 -0000 1.6 *************** *** 35,39 **** $sn =~ s/^SN://; ! my $dbh1 = DBI->connect('dbi:Pg:dbname=chado-celsius;host=soleus.ctrl.ucla.edu','',''); if ( $method eq 'cel' ) { --- 35,39 ---- $sn =~ s/^SN://; ! my $dbh1 = DBI->connect('dbi:Pg:dbname=chado-celsius;host=torso.genomics.ctrl.ucla.edu'); if ( $method eq 'cel' ) { *************** *** 56,67 **** elsif ( $valid_method{ $method } ) { #Brian may want the mas5.call and mas5.p my $sth = $dbh1->prepare(qq( SET search_path TO cel, part_elementresult, public; ! SELECT d1.name || x1.accession AS accession, x2.accession AS probeset, $method.signal ! FROM part_elementresult.$method, cel, cel_dbxref AS cx, element, dbxref AS x1, dbxref AS x2, db AS d1, quantification AS q ! WHERE $method.quantification_id = q.quantification_id AND q.acquisition_id = cel.cel_id ! AND $method.element_id = element.element_id AND element.dbxref_id = x2.dbxref_id AND cel.cel_id = cx.cel_id --- 56,68 ---- elsif ( $valid_method{ $method } ) { #Brian may want the mas5.call and mas5.p + my $sth = $dbh1->prepare(qq( SET search_path TO cel, part_elementresult, public; ! SELECT d1.name || x1.accession AS accession, x2.accession AS probeset, r.signal ! FROM part_elementresult.${method}_byq AS r, cel, cel_dbxref AS cx, element, dbxref AS x1, dbxref AS x2, db AS d1, quantification AS q ! WHERE r.quantification_id = q.quantification_id AND q.acquisition_id = cel.cel_id ! AND r.element_id = element.element_id AND element.dbxref_id = x2.dbxref_id AND cel.cel_id = cx.cel_id |
|
From: <all...@su...> - 2006-03-13 23:22:58
|
Update of /cvsroot/Geo-Google/lib/Geo In directory sumo.genetics.ucla.edu:/tmp/cvs-serv19028/lib/Geo Modified Files: Google.pm Log Message: version bump for release Index: Google.pm =================================================================== RCS file: /cvsroot/Geo-Google/lib/Geo/Google.pm,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** Google.pm 13 Mar 2006 23:22:24 -0000 1.3 --- Google.pm 13 Mar 2006 23:22:53 -0000 1.4 *************** *** 122,126 **** use strict; use warnings; ! our $VERSION = '0.01'; #this gets a javascript page containing map XML --- 122,126 ---- use strict; use warnings; ! our $VERSION = '0.02'; #this gets a javascript page containing map XML |
|
From: <all...@su...> - 2006-03-13 23:22:29
|
Update of /cvsroot/Geo-Google/lib/Geo/Google In directory sumo.genetics.ucla.edu:/tmp/cvs-serv18966/lib/Geo/Google Modified Files: Location.pm Log Message: update to specify version number of gmaps api Index: Location.pm =================================================================== RCS file: /cvsroot/Geo-Google/lib/Geo/Google/Location.pm,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** Location.pm 3 Aug 2005 15:56:07 -0000 1.3 --- Location.pm 13 Mar 2006 23:22:24 -0000 1.4 *************** *** 61,65 **** =head1 SYNOPSIS ! use Geo::Google::Location; # you shouldn't need to construct these yourself, # have a Geo::Google object do it for you. --- 61,65 ---- =head1 SYNOPSIS ! use Geo::Google::Point; # you shouldn't need to construct these yourself, # have a Geo::Google object do it for you. |
|
From: <all...@su...> - 2006-03-13 23:22:29
|
Update of /cvsroot/Geo-Google/lib/Geo In directory sumo.genetics.ucla.edu:/tmp/cvs-serv18966/lib/Geo Modified Files: Google.pm Log Message: update to specify version number of gmaps api Index: Google.pm =================================================================== RCS file: /cvsroot/Geo-Google/lib/Geo/Google.pm,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** Google.pm 2 Aug 2005 05:48:12 -0000 1.2 --- Google.pm 13 Mar 2006 23:22:24 -0000 1.3 *************** *** 125,132 **** #this gets a javascript page containing map XML ! use constant LQ => 'http://maps.google.com/maps?output=js&q=%s'; #this gets a javascript page containing map XML. special for "nearby" searches ! use constant NQ => 'http://maps.google.com/maps?output=js&near=%s&q=%s'; #used in polyline codec --- 125,132 ---- #this gets a javascript page containing map XML ! use constant LQ => 'http://maps.google.com/maps?output=js&v=1&q=%s'; #this gets a javascript page containing map XML. special for "nearby" searches ! use constant NQ => 'http://maps.google.com/maps?output=js&v=1&near=%s&q=%s'; #used in polyline codec |
|
From: <all...@su...> - 2006-03-11 02:56:06
|
Update of /cvsroot/libnelson/Pg/celsius/bin In directory sumo.genetics.ucla.edu:/tmp/cvs-serv3867/bin Modified Files: profile.pl Log Message: updates to use torso. also added "-m" option (untested) to allow specification of full profile, rather than dimension coordinates to create matrix from db dynamically. Index: profile.pl =================================================================== RCS file: /cvsroot/libnelson/Pg/celsius/bin/profile.pl,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** profile.pl 3 Feb 2006 01:28:58 -0000 1.5 --- profile.pl 11 Mar 2006 02:56:02 -0000 1.6 *************** *** 7,10 **** --- 7,11 ---- my $cel_file; my $element_file; + my $matrix_file; my $verbose; my $help; *************** *** 13,21 **** "cel|c=s" => \$cel_file, "element|e=s" => \$element_file, "verbose|v" => \$verbose, "help|h" => \$help ); ! if ( ! $cel_file or ! $element_file or $help ) { print <<"USAGE"; Usage: $0 [-v] -c <file of SN accessions> -e <file of probeset identifiers> --- 14,23 ---- "cel|c=s" => \$cel_file, "element|e=s" => \$element_file, + "matrix|m=s" => \$matrix_file, "verbose|v" => \$verbose, "help|h" => \$help ); ! if ( !($matrix_file or ($cel_file and $element_file)) or $help ) { print <<"USAGE"; Usage: $0 [-v] -c <file of SN accessions> -e <file of probeset identifiers> *************** *** 24,31 **** Celsius warehouse. ! This program takes two inputs: 1) a list of SN identifiers as provisioned by the celsius CEL warehouse 2) a list of Affymetrix probeset identifiers. $0 caculates Euclidean distance in a P-dimensional space where P is the number of probesets in the probeset identifier file. The median values of all SN --- 26,39 ---- Celsius warehouse. ! This program can operate in two modes. ! ! The first mode requires the -c and -e options, taking two inputs: 1) a list of SN identifiers as provisioned by the celsius CEL warehouse 2) a list of Affymetrix probeset identifiers. 
+ The second mode requires the -m options, taking one input: + * a matrix, columns as chip IDs (your own), rows as Affymetrix probeset + identifiers. + $0 caculates Euclidean distance in a P-dimensional space where P is the number of probesets in the probeset identifier file. The median values of all SN *************** *** 45,93 **** } ! my $dbh = DBI->connect('dbi:Pg:dbname=modulus;host=soleus.ctrl.ucla.edu','allenday',''); ! my $element_sth = $dbh->prepare('SELECT element_id FROM element WHERE name = ?'); ! my $signal_sth = $dbh->prepare("SELECT cel.db || ':' || cel.accession AS accession, element.name, result.signal FROM cel, element, result WHERE cel.cel_id = result.cel_id AND element.element_id = result.element_id AND element.element_id = ?"); my @cel; - print STDERR "reading cel file..." if $verbose; - open(F, $cel_file); - my @cel = <F>; - chomp @cel; - close(F); - print STDERR "done\n" if $verbose; - my @element; ! print STDERR "reading element file..." if $verbose; ! open(F, $element_file); ! my @element = <F>; ! chomp @element; ! close(F); ! print STDERR "done\n" if $verbose; my %result = (); my %percentile = (); - my %profile = (); - my %element = (); my %sample = map {$_=>1} @cel; foreach my $e ( @element ) { - $element_sth->execute( $e ); - my ( $id ) = $element_sth->fetchrow_array(); - die "no id for $e" unless $id; - $element{ $e } = $id; - print STDERR "element_id for $e = $id\n" if $verbose; - } - - foreach my $e ( keys %element ) { my $full_dist = Statistics::Descriptive::Full->new(); ! my $prof_dist = Statistics::Descriptive::Full->new(); my @accessions = (); print STDERR "retrieving signal for element $e..." if $verbose; ! $signal_sth->execute( $element{ $e } ); while ( my $row = $signal_sth->fetchrow_hashref ) { $full_dist->add_data( $row->{ 'signal' } ); ! if ( $sample{ $row->{ 'accession' } } ) { $prof_dist->add_data( $row->{ 'signal' } ); } --- 53,124 ---- } ! 
my $dbh = DBI->connect('dbi:Pg:dbname=chado-celsius;host=torso.genomics.ctrl.ucla.edu'); ! $dbh->do('SET search_path TO cel, annot, part_elementresult, public'); ! my $element_sth = $dbh->prepare('SELECT element_id FROM element AS e, dbxref AS x WHERE e.dbxref_id = x.dbxref_id AND x.accession = ?'); ! my $signal_sth = $dbh->prepare(" ! SELECT d.name || ':' || x.accession AS accession, r.signal FROM rma AS r, quantification AS q, cel AS c, cel_dbxref AS cx, dbxref AS x, db AS d WHERE r.element_id = (SELECT element_id FROM element WHERE dbxref_id = (SELECT dbxref_id FROM dbxref WHERE accession = ?)) AND r.quantification_id = q.quantification_id AND c.cel_id = q.acquisition_id AND c.cel_id = cx.cel_id AND cx.dbxref_id = x.dbxref_id AND x.db_id = d.db_id AND d.name = 'SN' ! "); ! my $annotation_sth = $dbh->prepare("SELECT DISTINCT c.name FROM cvterm AS c, dbxref AS x1, dbxref AS x2, cel_dbxref AS cx, acquisition AS q, biomaterialprop AS p WHERE c.cvterm_id = p.type_id AND p.biomaterial_id = q.assay_id AND q.acquisition_id = cx.cel_id AND cx.dbxref_id = x1.dbxref_id AND x1.accession = ? AND c.dbxref_id = x2.dbxref_id AND x2.db_id = (SELECT db_id FROM db WHERE name = ?) AND p.part != 'allenday_tumor' ORDER BY c.name"); my @cel; my @element; ! my %matrix = (); ! ! if ( $matrix_file ) { ! print STDERR "reading matrix file..." if $verbose; ! open(F, $matrix_file) or die "couldn't open matrix file '$matrix_file': $!"; ! my $cel_line = <F>; ! chomp $cel_line; ! @cel = split /\t/, $cel_line; ! shift @cel; ! while ( my $element_line = <F> ) { ! chomp $element_line; ! my ( $e, @signal ) = split /\t/, $element_line; ! push @element, $e; ! my $element_dist = Statistics::Descriptive::Full->new(); ! $element_dist->add_data( @signal ); ! $matrix{ $e } = $element_dist; ! } ! close(F); ! } ! else { ! print STDERR "reading cel file..." if $verbose; ! open(F, $cel_file) or die "couldn't open cel file '$cel_file': $!"; ! @cel = <F>; ! chomp @cel; ! close(F); ! 
print STDERR "done\n" if $verbose; ! ! print STDERR "reading element file..." if $verbose; ! open(F, $element_file) or die "couldn't open element file '$element_file': $!"; ! @element = <F>; ! chomp @element; ! close(F); ! print STDERR "done\n" if $verbose; ! } my %result = (); my %percentile = (); my %sample = map {$_=>1} @cel; + my %profile = (); foreach my $e ( @element ) { my $full_dist = Statistics::Descriptive::Full->new(); ! ! my $prof_dist; ! if ( $matrix_file ) { ! $prof_dist = $matrix{ $e }; ! } ! else { ! $prof_dist = Statistics::Descriptive::Full->new(); ! } my @accessions = (); print STDERR "retrieving signal for element $e..." if $verbose; ! $signal_sth->execute( $e ); while ( my $row = $signal_sth->fetchrow_hashref ) { $full_dist->add_data( $row->{ 'signal' } ); ! if ( (!$matrix_file) and $sample{ $row->{ 'accession' } } ) { $prof_dist->add_data( $row->{ 'signal' } ); } *************** *** 133,140 **** } $distance = sqrt( $distance ); ! push @n, [$c, $distance]; } ! foreach my $n ( sort { $a->[1] <=> $b->[1] } @n ) { ! print $n->[0] ."\t". $n->[1] ."\n"; } --- 164,190 ---- } $distance = sqrt( $distance ); ! ! ! push @n, [$distance, $c]; } ! foreach my $n ( sort { $a->[0] <=> $b->[0] } @n ) { ! my $mpath = get_annotations( $n->[1], 'MPATH' ); ! my $phenotype = get_annotations( $n->[1], 'MP' ); ! my $cell = get_annotations( $n->[1], 'CL' ); ! my $anatomy = get_annotations( $n->[1], 'MA' ); ! my $etc = get_annotations( $n->[1], 'null' ); ! print join( "\t", ( @{$n}, $anatomy, $mpath, $phenotype, $cell, $etc ) ), "\n"; ! } ! ! sub get_annotations { ! my $snid = shift; ! my $dbspace = shift; ! $snid =~ s/SN://; ! $annotation_sth->execute($snid, $dbspace); ! my @a; ! while ( my ( $name ) = $annotation_sth->fetchrow_array() ) { ! push @a, $name; ! } ! return join ';', @a; } |
|
From: <all...@su...> - 2006-02-28 19:49:05
|
Update of /cvsroot/libnelson/java In directory sumo.genetics.ucla.edu:/tmp/cvs-serv26404 Modified Files: gecCel.java Log Message: get new samples first Index: gecCel.java =================================================================== RCS file: /cvsroot/libnelson/java/gecCel.java,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** gecCel.java 28 Feb 2006 19:37:25 -0000 1.4 --- gecCel.java 28 Feb 2006 19:48:56 -0000 1.5 *************** *** 17,21 **** public class gecCel { public static void main(String[] argv) { ! String SEL_EXP_RUN = "SELECT u.acs_user_id user_id, experiment.experiment_id, run.run_id, sample.sample_id, chip.chip_id, chip.file_id || '.file' file_id FROM sb_users u, gen_experiments experiment, gen_experiment_runs run, gen_experiment_chips sample, gen_chip_files chip, gen_file_types type WHERE run.run_id = sample.run_id AND type.file_type_id = chip.file_type_id AND sample.chip_id = chip.chip_id AND type.file_extension = 'cel' AND experiment.experiment_id = run.experiment_id AND experiment.survey_taker_id = u.user_id"; // String SEL_ANNOT = "SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.cell_type_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND sample_id = ? UNION SELECT d.sample_id, 'null:' || c.name FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c WHERE d.dev_stage_level_id = c.cvterm_id AND sample_id = ? UNION SELECT d.sample_id, 'null:' || c.name FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c WHERE d.cell_growth_level_id = c.cvterm_id AND sample_id = ? UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_ds d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.disease_state_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND d.sample_id = ? 
UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_ed d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.experiment_design_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND d.sample_id = ? UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_pt d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.phenotype_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND d.sample_id = ? UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.rna_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND sample_id = ?"; --- 17,21 ---- public class gecCel { public static void main(String[] argv) { ! String SEL_EXP_RUN = "SELECT u.acs_user_id user_id, experiment.experiment_id, run.run_id, sample.sample_id, chip.chip_id, chip.file_id || '.file' file_id FROM sb_users u, gen_experiments experiment, gen_experiment_runs run, gen_experiment_chips sample, gen_chip_files chip, gen_file_types type WHERE run.run_id = sample.run_id AND type.file_type_id = chip.file_type_id AND sample.chip_id = chip.chip_id AND type.file_extension = 'cel' AND experiment.experiment_id = run.experiment_id AND experiment.survey_taker_id = u.user_id ORDER BY sample_id DESC"; // String SEL_ANNOT = "SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.cell_type_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND sample_id = ? UNION SELECT d.sample_id, 'null:' || c.name FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c WHERE d.dev_stage_level_id = c.cvterm_id AND sample_id = ? UNION SELECT d.sample_id, 'null:' || c.name FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c WHERE d.cell_growth_level_id = c.cvterm_id AND sample_id = ? 
UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_ds d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.disease_state_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND d.sample_id = ? UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_ed d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.experiment_design_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND d.sample_id = ? UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_pt d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.phenotype_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND d.sample_id = ? UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.rna_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND sample_id = ?"; |
|
From: <all...@su...> - 2006-02-28 19:37:29
|
Update of /cvsroot/libnelson/java In directory sumo.genetics.ucla.edu:/tmp/cvs-serv25298 Modified Files: gecCel.java Log Message: removed dup attribute Index: gecCel.java =================================================================== RCS file: /cvsroot/libnelson/java/gecCel.java,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** gecCel.java 27 Feb 2006 07:58:07 -0000 1.3 --- gecCel.java 28 Feb 2006 19:37:25 -0000 1.4 *************** *** 44,48 **** "sample_id=\"" + sample_id + "\" " + "user_id=\"" + user_id + "\" " + - "run_id=\"" + run_id + "\" " + "experiment_id=\"" + exp_id + "\" " + ">"); --- 44,47 ---- |
|
From: <boc...@su...> - 2006-02-28 07:41:09
|
Update of /cvsroot/dev-boconnor/project_logic_analysis/conf In directory sumo.genetics.ucla.edu:/tmp/cvs-serv4067 Modified Files: original_glioma_classification_with_vgl.xml original_glioma_vgl_parsing_pipe.xml Log Message: I did a code review on these modules to make sure the performance I'm seeing with VGL is correct. I found some bugs, specifically in the 1A category (due to a problem with the VGL output format) and also with the mean of the category being used rather than the other categories means. These problems have been fixed yet the performance looks the same if not a little worse. Index: original_glioma_classification_with_vgl.xml =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/conf/original_glioma_classification_with_vgl.xml,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** original_glioma_classification_with_vgl.xml 23 Feb 2006 22:11:58 -0000 1.2 --- original_glioma_classification_with_vgl.xml 28 Feb 2006 07:41:06 -0000 1.3 *************** *** 33,37 **** </output> </step> ! <step id="48.4" active="0" type="shell_command" processor="R"> <processor_args> <arg id="1" name="" value="--vanilla"/> --- 33,37 ---- </output> </step> ! <step id="48.4" active="1" type="shell_command" processor="R"> <processor_args> <arg id="1" name="" value="--vanilla"/> Index: original_glioma_vgl_parsing_pipe.xml =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/conf/original_glioma_vgl_parsing_pipe.xml,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** original_glioma_vgl_parsing_pipe.xml 23 Feb 2006 22:11:58 -0000 1.2 --- original_glioma_vgl_parsing_pipe.xml 28 Feb 2006 07:41:06 -0000 1.3 *************** *** 7,11 **** way around this would be for Marc to alter the output so the summary columns are in a fixed position. --> ! 
<step id="30.2" active="0" type="module" processor="Nelson::Pipe::Container::Job::ReadVGLOutput"> <input> <item id="profile_output_dir" value="data/[% datadir %]/vgl"/> --- 7,11 ---- way around this would be for Marc to alter the output so the summary columns are in a fixed position. --> ! <step id="30.2" active="1" type="module" processor="Nelson::Pipe::Container::Job::ReadVGLOutput"> <input> <item id="profile_output_dir" value="data/[% datadir %]/vgl"/> |
|
From: <boc...@su...> - 2006-02-28 07:36:47
|
Update of /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job In directory sumo.genetics.ucla.edu:/tmp/cvs-serv3630/lib/perl/Nelson/Pipe/Container/Job Modified Files: ReadVGLOutput.pm ScoreSamplesViaVGL.pm Log Message: I did a code review on these modules to make sure the performance I'm seeing with VGL is correct. I found some bugs, specifically in the 1A category (due to a problem with the VGL output format) and also with the mean of the category being used rather than the other categories means. These problems have been fixed yet the performance looks the same if not a little worse. Index: ScoreSamplesViaVGL.pm =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/ScoreSamplesViaVGL.pm,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** ScoreSamplesViaVGL.pm 23 Feb 2006 22:11:59 -0000 1.6 --- ScoreSamplesViaVGL.pm 28 Feb 2006 07:36:43 -0000 1.7 *************** *** 116,120 **** foreach my $probe (keys %{$top_profiles->{$hc}}) { #print "On File: $file using HC: $hc using Probe: $probe\n"; ! my $score_mean = $top_profiles->{$hc}{$probe}{mean} / $top_profiles->{$hc}{$probe}{counts}; my $sample_mean = $exp_values->{$file}{$probe}; #print "$file $probe ".Dumper($exp_values->{$file}{$probe})."\n"; --- 116,147 ---- foreach my $probe (keys %{$top_profiles->{$hc}}) { #print "On File: $file using HC: $hc using Probe: $probe\n"; ! # BUG: I think this should actually be the others_mean! ! #my $score_mean = $top_profiles->{$hc}{$probe}{mean} / $top_profiles->{$hc}{$probe}{counts}; ! my $score_mean = $top_profiles->{$hc}{$probe}{others_mean} / $top_profiles->{$hc}{$probe}{counts}; ! ! # just used one vgl output for this ! #The probe: 202189_x_at ! #$VAR1 = { ! # 'others_mean' => '1271.90763461538', ! # 'pvalue' => '2.13236165697335e-12', ! # 'mean' => '601.166214285714', ! # 'counts' => 3 ! # }; ! ! 
# something is a bit fishy, the counts above, why isn't it 1? ! # now if I use the first 10 vgl output files: ! #The probe: 202189_x_at ! #$VAR1 = { ! # 'others_mean' => '14165.1471730769', ! # 'pvalue' => '1.16199434367298e-10', ! # 'mean' => '6678.66514285714', ! # 'counts' => 33 ! # }; ! # where does 33 come from? Also, is that an average p-value? I need to debug where this structure is coming from! ! #print "The probe: 202189_x_at\n"; #202189_x_at ! #print Dumper $top_profiles->{$hc}{'202189_x_at'}; exit; ! # FIXME: the count problem was isolated to the parsing script and should be fixed ! ! my $sample_mean = $exp_values->{$file}{$probe}; #print "$file $probe ".Dumper($exp_values->{$file}{$probe})."\n"; *************** *** 127,133 **** # FIXME: this actually performs worse so I think I need to keep track of which VG are actually lower expression # then the others ! #if (defined($sample_mean) && abs($sample_mean/$score_mean) >= 2) { ! if (defined($sample_mean) && (($sample_mean/$score_mean) >= 2 || ($sample_mean/$score_mean) <= 0.5 )) { #print " Yes this is positive for $hc\n"; $class->{$subdir}{$profile_count_cutoff}{$file}{scores}{$hc}{raw_score} += 1; } else { --- 154,162 ---- # FIXME: this actually performs worse so I think I need to keep track of which VG are actually lower expression # then the others ! if (defined($sample_mean) && abs($sample_mean/$score_mean) >= 2 && $top_profiles->{$hc}{$probe}{mean} > $top_profiles->{$hc}{$probe}{others_mean}) { ! 
#if (defined($sample_mean) && (($sample_mean/$score_mean) >= 2 || ($sample_mean/$score_mean) <= 0.5 )) { #print " Yes this is positive for $hc\n"; + $class->{$subdir}{$profile_count_cutoff}{$file}{scores}{$hc}{raw_score} += 1; + } elsif (defined($sample_mean) && abs($sample_mean/$score_mean) <= 0.5 && $top_profiles->{$hc}{$probe}{mean} < $top_profiles->{$hc}{$probe}{others_mean}) { $class->{$subdir}{$profile_count_cutoff}{$file}{scores}{$hc}{raw_score} += 1; } else { Index: ReadVGLOutput.pm =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/ReadVGLOutput.pm,v retrieving revision 1.8 retrieving revision 1.9 diff -C2 -d -r1.8 -r1.9 *** ReadVGLOutput.pm 23 Feb 2006 22:11:58 -0000 1.8 --- ReadVGLOutput.pm 28 Feb 2006 07:36:42 -0000 1.9 *************** *** 9,12 **** --- 9,15 ---- # FIXME: doesn't work with multiple subdirs + # DEBUG + my $foo = {}; + =head2 process *************** *** 123,128 **** foreach my $info (sort keys %{ $reference->{$hc}{$probe}}) { $results->{$hc}{$probe}{$info} += $reference->{$hc}{$probe}{$info}; ! $results->{$hc}{$probe}{counts}++; } } } --- 126,133 ---- foreach my $info (sort keys %{ $reference->{$hc}{$probe}}) { $results->{$hc}{$probe}{$info} += $reference->{$hc}{$probe}{$info}; ! # FIXME: is there a reason the counts might not be accurate here???!?!?! ! #$results->{$hc}{$probe}{counts}++; } + $results->{$hc}{$probe}{counts}++; } } *************** *** 160,164 **** } ! sub _parse_vgl_output { my ($self, $file, $offset) = @_; --- 165,170 ---- } ! # FIXME: the program that makes the VGL output reverses the order of one of the output ! 
# columns, make sure this is taking into account when the following code is run sub _parse_vgl_output { my ($self, $file, $offset) = @_; *************** *** 187,195 **** my $pvalue = 0; my $mean = 0; my $true_count = 0; if ($tokens[97-$offset] eq "TRUE") { $HC = $classification->{97}; $pvalue = $tokens[96-$offset]; ! $mean = $tokens[93-$offset]; $true_count++; } if ($tokens[104-$offset] eq "TRUE") { --- 193,206 ---- my $pvalue = 0; my $mean = 0; + my $others_mean = 0; my $true_count = 0; if ($tokens[97-$offset] eq "TRUE") { $HC = $classification->{97}; $pvalue = $tokens[96-$offset]; ! # BUG: the column changes here, the first entry is reversed where the mean of 1A is first and ! # the second column 1B_2A_2B mean ! #$mean = $tokens[93-$offset]; ! $mean = $tokens[92-$offset]; ! $others_mean = $tokens[93-$offset]; $true_count++; } if ($tokens[104-$offset] eq "TRUE") { *************** *** 197,200 **** --- 208,212 ---- $pvalue = $tokens[103-$offset]; $mean = $tokens[100-$offset]; + $others_mean = $tokens[99-$offset]; $true_count++; } if ($tokens[111-$offset] eq "TRUE") { *************** *** 202,209 **** --- 214,223 ---- $pvalue = $tokens[110-$offset]; $mean = $tokens[107-$offset]; + $others_mean = $tokens[106-$offset]; $true_count++; } if ($tokens[118-$offset] eq "TRUE") { $HC = $classification->{118}; $pvalue = $tokens[117-$offset]; + $others_mean = $tokens[116-$offset]; $mean = $tokens[114-$offset]; $true_count++; *************** *** 219,223 **** #$result->{by_pvalue}{$HC}{$pvalue}{$tokens[0]} = 1; $result->{$HC}{$tokens[0]}{pvalue} = $pvalue; ! $result->{$HC}{$tokens[0]}{mean} = $mean } } --- 233,243 ---- #$result->{by_pvalue}{$HC}{$pvalue}{$tokens[0]} = 1; $result->{$HC}{$tokens[0]}{pvalue} = $pvalue; ! $result->{$HC}{$tokens[0]}{mean} = $mean; ! $result->{$HC}{$tokens[0]}{others_mean} = $others_mean; ! if ($mean > $others_mean) { ! #print "Mean of $mean is greater than others mean of $others_mean for $tokens[0]\n"; ! } else { ! 
#print "Mean of $mean is less than others mean of $others_mean for $tokens[0]\n"; ! } } } |
|
From: <all...@su...> - 2006-02-27 07:58:11
|
Update of /cvsroot/libnelson/java In directory sumo.genetics.ucla.edu:/tmp/cvs-serv24819 Modified Files: gecCel.java Log Message: add run, user, experiment ids Index: gecCel.java =================================================================== RCS file: /cvsroot/libnelson/java/gecCel.java,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** gecCel.java 17 Feb 2006 01:45:32 -0000 1.2 --- gecCel.java 27 Feb 2006 07:58:07 -0000 1.3 *************** *** 17,22 **** public class gecCel { public static void main(String[] argv) { ! String SEL_EXP_RUN = "SELECT sample.sample_id, chip.chip_id, chip.file_id || '.file' file_id FROM gen_experiment_chips sample, gen_chip_files chip, gen_file_types type WHERE type.file_type_id = chip.file_type_id AND sample.chip_id = chip.chip_id AND type.file_extension = 'cel'"; ! // String SEL_ANNOT = "SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.cell_type_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND sample_id = ? UNION SELECT d.sample_id, 'null:' || c.name FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c WHERE d.dev_stage_level_id = c.cvterm_id AND sample_id = ? UNION SELECT d.sample_id, 'null:' || c.name FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c WHERE d.cell_growth_level_id = c.cvterm_id AND sample_id = ? UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_ds d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.disease_state_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND d.sample_id = ? UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_ed d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.experiment_design_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref! _id AND d.sample_id = ? 
UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_pt d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.phenotype_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND d.sample_id = ? UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.rna_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND sample_id = ?"; --- 17,21 ---- public class gecCel { public static void main(String[] argv) { ! String SEL_EXP_RUN = "SELECT u.acs_user_id user_id, experiment.experiment_id, run.run_id, sample.sample_id, chip.chip_id, chip.file_id || '.file' file_id FROM sb_users u, gen_experiments experiment, gen_experiment_runs run, gen_experiment_chips sample, gen_chip_files chip, gen_file_types type WHERE run.run_id = sample.run_id AND type.file_type_id = chip.file_type_id AND sample.chip_id = chip.chip_id AND type.file_extension = 'cel' AND experiment.experiment_id = run.experiment_id AND experiment.survey_taker_id = u.user_id"; // String SEL_ANNOT = "SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.cell_type_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND sample_id = ? UNION SELECT d.sample_id, 'null:' || c.name FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c WHERE d.dev_stage_level_id = c.cvterm_id AND sample_id = ? UNION SELECT d.sample_id, 'null:' || c.name FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c WHERE d.cell_growth_level_id = c.cvterm_id AND sample_id = ? UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_ds d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.disease_state_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND d.sample_id = ? 
UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_ed d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.experiment_design_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref! _id AND d.sample_id = ? UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_pt d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.phenotype_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND d.sample_id = ? UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.rna_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND sample_id = ?"; *************** *** 33,41 **** System.out.println("<files>"); while(rs1.next()) { ! int chip_id = rs1.getInt("CHIP_ID"); ! String file_id = rs1.getString("FILE_ID"); ! int sample_id = rs1.getInt("SAMPLE_ID"); ! System.out.println(" <file file_id=\"" + file_id + "\" chip_id=\"" + chip_id + "\" sample_id=\"" + sample_id + "\">"); stmt2 = conn.prepareStatement( --- 32,50 ---- System.out.println("<files>"); while(rs1.next()) { ! int chip_id = rs1.getInt("CHIP_ID"); ! int sample_id = rs1.getInt("SAMPLE_ID"); ! int run_id = rs1.getInt("RUN_ID"); ! int exp_id = rs1.getInt("EXPERIMENT_ID"); ! int user_id = rs1.getInt("USER_ID"); ! String file_id = rs1.getString("FILE_ID"); ! System.out.println(" <file run_id=\"" + run_id + "\" " + ! "file_id=\"" + file_id + "\" " + ! "chip_id=\"" + chip_id + "\" " + ! "sample_id=\"" + sample_id + "\" " + ! "user_id=\"" + user_id + "\" " + ! "run_id=\"" + run_id + "\" " + ! "experiment_id=\"" + exp_id + "\" " + ! ">"); stmt2 = conn.prepareStatement( |
|
From: <boc...@su...> - 2006-02-23 22:12:13
|
Update of /cvsroot/dev-boconnor/project_logic_analysis/conf In directory sumo.genetics.ucla.edu:/tmp/cvs-serv7391/conf Modified Files: classification_with_profiles.xml original_glioma_classification_with_vgl.xml original_glioma_expanded_phenotypes_pipe.xml original_glioma_vgl_parsing_pipe.xml p53_breast_cancer_data_Miller_et_al_2005.xml t-cell_leukemia_data_Soulier_et_al_2005.xml Log Message: Updates Index: t-cell_leukemia_data_Soulier_et_al_2005.xml =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/conf/t-cell_leukemia_data_Soulier_et_al_2005.xml,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** t-cell_leukemia_data_Soulier_et_al_2005.xml 22 Feb 2006 09:12:24 -0000 1.1 --- t-cell_leukemia_data_Soulier_et_al_2005.xml 23 Feb 2006 22:11:58 -0000 1.2 *************** *** 1,5 **** <!-- Variables that are used throughout --> [% datadir = "t-cell_leukemia_data_Soulier_et_al_2005" %] ! [% cutoff_for_stability_percent = 10 %] [% dirs = ['90'] %] [% dir_str = '90' %] --- 1,5 ---- <!-- Variables that are used throughout --> [% datadir = "t-cell_leukemia_data_Soulier_et_al_2005" %] ! [% cutoff_for_stability_percent = 6 %] [% dirs = ['90'] %] [% dir_str = '90' %] *************** *** 71,75 **** <!-- FIXME: need to find all the profiles otherwise there won't be much to classify with --> [% index = index + 100 %] <!-- FIXME: scoping issues with this variable!! --> ! <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::PickTopProfiles"> <input> <item id="parsed_output_stashname" value="summary_of_ppla_output"/> --- 71,75 ---- <!-- FIXME: need to find all the profiles otherwise there won't be much to classify with --> [% index = index + 100 %] <!-- FIXME: scoping issues with this variable!! --> ! 
<step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::PickTopProfiles"> <input> <item id="parsed_output_stashname" value="summary_of_ppla_output"/> *************** *** 112,116 **** [% FOREACH dir = dirs %] <!-- FIXME: does this sort the profiles? I don't do that elsewhere. Maybe I should in PickTopProfiles --> ! <step id="[% index %]" active="0" type="shell_command" processor="perl scripts/exportTriplets.pl"> <processor_args> <arg id="1" name="" value="data/[% datadir %]/top_profiles/[% dir %]/top_[% cutoff_for_stability_percent %]_percent.profiles"/> --- 112,116 ---- [% FOREACH dir = dirs %] <!-- FIXME: does this sort the profiles? I don't do that elsewhere. Maybe I should in PickTopProfiles --> ! <step id="[% index %]" active="1" type="shell_command" processor="perl scripts/exportTriplets.pl"> <processor_args> <arg id="1" name="" value="data/[% datadir %]/top_profiles/[% dir %]/top_[% cutoff_for_stability_percent %]_percent.profiles"/> *************** *** 124,128 **** </step> [% index = index + 1 %] ! <step id="[% index %]" active="0" type="shell_command" processor="perl scripts/hypergeometric.pl"> <processor_args> <arg id="1" name="" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt"/> --- 124,128 ---- </step> [% index = index + 1 %] ! <step id="[% index %]" active="1" type="shell_command" processor="perl scripts/hypergeometric.pl"> <processor_args> <arg id="1" name="" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt"/> *************** *** 140,144 **** visualiseTriplet.pl output.--> [% index = index + 1 %] ! 
<step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::AppendPValuesToExportOutput"> <input> <item id="profile_output" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt"/> --- 140,144 ---- visualiseTriplet.pl output.--> [% index = index + 1 %] ! <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::AppendPValuesToExportOutput"> <input> <item id="profile_output" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt"/> *************** *** 158,162 **** <!-- FIXME: this is redundant with what's below! --> [% index = index + 1 %] ! <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAOutputProfiles"> <input> <item id="ppla_output_profiles_dir" value="data/[% datadir %]/top_profiles"/> --- 158,162 ---- <!-- FIXME: this is redundant with what's below! --> [% index = index + 1 %] ! <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAOutputProfiles"> <input> <item id="ppla_output_profiles_dir" value="data/[% datadir %]/top_profiles"/> Index: classification_with_profiles.xml =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/conf/classification_with_profiles.xml,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** classification_with_profiles.xml 22 Feb 2006 09:12:23 -0000 1.1 --- classification_with_profiles.xml 23 Feb 2006 22:11:58 -0000 1.2 *************** *** 9,13 **** <!-- FIXME: all PPLA input files must contain /^sample/ on the first row --> [% index = index + 1 %] ! 
<step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAInputFile"> <input> <item id="ppla_input_file" value="data/[% datadir %]/profiles/100/file_list_0.profile"/> --- 9,13 ---- <!-- FIXME: all PPLA input files must contain /^sample/ on the first row --> [% index = index + 1 %] ! <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAInputFile"> <input> <item id="ppla_input_file" value="data/[% datadir %]/profiles/100/file_list_0.profile"/> *************** *** 48,52 **** [% index = index + 1 %] <!-- FIXME: this needs to loop and only use the top 10,20,30,40...100 top profiles for each HC --> ! <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::ScoreSamplesViaProfiles"> <input> <!-- FIXME: these phenotypes are hard coded, may need to change with different datasets --> --- 48,52 ---- [% index = index + 1 %] <!-- FIXME: this needs to loop and only use the top 10,20,30,40...100 top profiles for each HC --> ! <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::ScoreSamplesViaProfiles"> <input> <!-- FIXME: these phenotypes are hard coded, may need to change with different datasets --> *************** *** 66,70 **** </step> [% index = index + 1 %] ! <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::SummarizeClassification"> <input> <item id="stash_input" value="scores_for_samples"/> --- 66,70 ---- </step> [% index = index + 1 %] ! <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::SummarizeClassification"> <input> <item id="stash_input" value="scores_for_samples"/> *************** *** 80,84 **** </step> [% index = index + 1 %] ! <step id="[% index %]" active="0" type="shell_command" processor="R"> <processor_args> <arg id="1" name="" value="--vanilla"/> --- 80,84 ---- </step> [% index = index + 1 %] ! 
<step id="[% index %]" active="1" type="shell_command" processor="R"> <processor_args> <arg id="1" name="" value="--vanilla"/> Index: original_glioma_expanded_phenotypes_pipe.xml =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/conf/original_glioma_expanded_phenotypes_pipe.xml,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** original_glioma_expanded_phenotypes_pipe.xml 16 Feb 2006 08:19:59 -0000 1.7 --- original_glioma_expanded_phenotypes_pipe.xml 23 Feb 2006 22:11:58 -0000 1.8 *************** *** 55,59 **** <!-- Code that parses the Voting Gene List output --> ! [%# INCLUDE original_glioma_vgl_parsing_pipe.xml %] <!-- --- 55,59 ---- <!-- Code that parses the Voting Gene List output --> ! [% INCLUDE original_glioma_vgl_parsing_pipe.xml %] <!-- *************** *** 237,241 **** (maybe I can work with Barry on a statistical technique that doesn't require random sampling) --> ! [% INCLUDE search_for_tf_binding_sites.xml %] --- 237,241 ---- (maybe I can work with Barry on a statistical technique that doesn't require random sampling) --> ! [%# INCLUDE search_for_tf_binding_sites.xml %] *************** *** 248,252 **** <!-- Perform the classification based on the vgl from Marc --> ! [%# INCLUDE original_glioma_classification_with_vgl.xml %] <!-- Collect some statistics on stability and U score --> --- 248,252 ---- <!-- Perform the classification based on the vgl from Marc --> ! 
[% INCLUDE original_glioma_classification_with_vgl.xml %] <!-- Collect some statistics on stability and U score --> Index: original_glioma_classification_with_vgl.xml =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/conf/original_glioma_classification_with_vgl.xml,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** original_glioma_classification_with_vgl.xml 8 Nov 2005 02:13:47 -0000 1.1 --- original_glioma_classification_with_vgl.xml 23 Feb 2006 22:11:58 -0000 1.2 *************** *** 5,9 **** [% i = 1 %] [% FOREACH dir = dirs %] ! <step id="48" active="0" type="module" processor="Nelson::Pipe::Container::Job::ScoreSamplesViaVGL"> <input> <item id="list_of_files" value="data/[% datadir %]/111_glioma_classification/to_classify_file_list.txt"/> --- 5,9 ---- [% i = 1 %] [% FOREACH dir = dirs %] ! <step id="48" active="1" type="module" processor="Nelson::Pipe::Container::Job::ScoreSamplesViaVGL"> <input> <item id="list_of_files" value="data/[% datadir %]/111_glioma_classification/to_classify_file_list.txt"/> *************** *** 20,24 **** </output> </step> ! <step id="48.2" active="0" type="module" processor="Nelson::Pipe::Container::Job::SummarizeClassification"> <input> <item id="stash_input" value="vgl_scores_for_samples"/> --- 20,24 ---- </output> </step> ! 
<step id="48.2" active="1" type="module" processor="Nelson::Pipe::Container::Job::SummarizeClassification"> <input> <item id="stash_input" value="vgl_scores_for_samples"/> Index: p53_breast_cancer_data_Miller_et_al_2005.xml =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/conf/p53_breast_cancer_data_Miller_et_al_2005.xml,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** p53_breast_cancer_data_Miller_et_al_2005.xml 22 Feb 2006 09:12:24 -0000 1.1 --- p53_breast_cancer_data_Miller_et_al_2005.xml 23 Feb 2006 22:11:58 -0000 1.2 *************** *** 1,5 **** <!-- Variables that are used throughout --> [% datadir = "p53_breast_cancer_data_Miller_et_al_2005" %] ! [% cutoff_for_stability_percent = 10 %] [% dirs = ['75'] %] [% dir_str = '75' %] --- 1,5 ---- <!-- Variables that are used throughout --> [% datadir = "p53_breast_cancer_data_Miller_et_al_2005" %] ! [% cutoff_for_stability_percent = 6 %] [% dirs = ['75'] %] [% dir_str = '75' %] *************** *** 74,78 **** <!-- FIXME: need to find all the profiles otherwise there won't be much to classify with --> [% index = index + 29 %] <!-- FIXME: scoping issues with this variable!! --> ! <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::PickTopProfiles"> <input> <item id="parsed_output_stashname" value="summary_of_ppla_output"/> --- 74,78 ---- <!-- FIXME: need to find all the profiles otherwise there won't be much to classify with --> [% index = index + 29 %] <!-- FIXME: scoping issues with this variable!! --> ! <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::PickTopProfiles"> <input> <item id="parsed_output_stashname" value="summary_of_ppla_output"/> *************** *** 115,119 **** [% FOREACH dir = dirs %] <!-- FIXME: does this sort the profiles? I don't do that elsewhere. Maybe I should in PickTopProfiles --> ! 
<step id="[% index %]" active="0" type="shell_command" processor="perl scripts/exportTriplets.pl"> <processor_args> <arg id="1" name="" value="data/[% datadir %]/top_profiles/[% dir %]/top_[% cutoff_for_stability_percent %]_percent.profiles"/> --- 115,119 ---- [% FOREACH dir = dirs %] <!-- FIXME: does this sort the profiles? I don't do that elsewhere. Maybe I should in PickTopProfiles --> ! <step id="[% index %]" active="1" type="shell_command" processor="perl scripts/exportTriplets.pl"> <processor_args> <arg id="1" name="" value="data/[% datadir %]/top_profiles/[% dir %]/top_[% cutoff_for_stability_percent %]_percent.profiles"/> *************** *** 127,131 **** </step> [% index = index + 1 %] ! <step id="[% index %]" active="0" type="shell_command" processor="perl scripts/hypergeometric.pl"> <processor_args> <arg id="1" name="" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt"/> --- 127,131 ---- </step> [% index = index + 1 %] ! <step id="[% index %]" active="1" type="shell_command" processor="perl scripts/hypergeometric.pl"> <processor_args> <arg id="1" name="" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt"/> *************** *** 143,147 **** visualiseTriplet.pl output.--> [% index = index + 1 %] ! <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::AppendPValuesToExportOutput"> <input> <item id="profile_output" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt"/> --- 143,147 ---- visualiseTriplet.pl output.--> [% index = index + 1 %] ! 
<step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::AppendPValuesToExportOutput"> <input> <item id="profile_output" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt"/> *************** *** 161,165 **** <!-- FIXME: this is redundant with what's below! --> [% index = index + 1 %] ! <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAOutputProfiles"> <input> <item id="ppla_output_profiles_dir" value="data/[% datadir %]/top_profiles"/> --- 161,165 ---- <!-- FIXME: this is redundant with what's below! --> [% index = index + 1 %] ! <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAOutputProfiles"> <input> <item id="ppla_output_profiles_dir" value="data/[% datadir %]/top_profiles"/> Index: original_glioma_vgl_parsing_pipe.xml =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/conf/original_glioma_vgl_parsing_pipe.xml,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** original_glioma_vgl_parsing_pipe.xml 8 Nov 2005 02:13:48 -0000 1.1 --- original_glioma_vgl_parsing_pipe.xml 23 Feb 2006 22:11:58 -0000 1.2 *************** *** 13,16 **** --- 13,18 ---- <item id="col_offset" value="19"/> <!-- was 12 before, was that wrong? 
--> <item id="reference_profile" value="data/[% datadir %]/vgl/100/All_DChip_expression_vals_longNames_groupTtest.xls"/> + <item id="parser_type" value="4_columns"/> + <item id="input_file_name" value="All_DChip_expression_vals_longNames_groupTtest.xls"/> </input> <output> *************** *** 25,28 **** --- 27,31 ---- <item id="col_offset" value="32"/> <item id="reference_profile" value="data/[% datadir %]/vgl/100/All_DChip_expression_vals_longNames_groupTtest.xls"/> + <item id="parser_type" value="2_phenotypes"/> </input> <output> *************** *** 37,40 **** --- 40,44 ---- <item id="col_offset" value="52"/> <item id="reference_profile" value="data/[% datadir %]/vgl/100/All_DChip_expression_vals_longNames_groupTtest.xls"/> + <item id="parser_type" value="2_phenotypes"/> </input> <output> |
|
From: <boc...@su...> - 2006-02-23 22:12:11
|
Update of /cvsroot/dev-boconnor/project_logic_analysis/data In directory sumo.genetics.ucla.edu:/tmp/cvs-serv7391/data Modified Files: classification_scatterplot.sxc Log Message: Updates Index: classification_scatterplot.sxc =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/data/classification_scatterplot.sxc,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 Binary files /tmp/cvsWl4QLN and /tmp/cvsUrYDrl differ |
|
From: <boc...@su...> - 2006-02-23 22:12:11
|
Update of /cvsroot/dev-boconnor/project_logic_analysis/scripts In directory sumo.genetics.ucla.edu:/tmp/cvs-serv7391/scripts Added Files: pull_out_top_profiles.pl Log Message: Updates --- NEW FILE: pull_out_top_profiles.pl --- use strict; use Data::Dumper; my $count = shift; my $data = {}; while(<STDIN>) { chomp; my @tokens = split /\t/; $data->{$tokens[7]}{$tokens[0]} = 1; } my @stability = sort {$a <=> $b } keys %{$data}; my $i = 0; while ($i < $count) { my $stab = pop @stability; print "Stability = $stab\n"; print Dumper $data->{$stab}; $i++; } |
|
From: <boc...@su...> - 2006-02-23 22:12:11
|
Update of /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job In directory sumo.genetics.ucla.edu:/tmp/cvs-serv7391/lib/perl/Nelson/Pipe/Container/Job Modified Files: ReadVGLOutput.pm ScoreSamplesViaVGL.pm Log Message: Updates Index: ScoreSamplesViaVGL.pm =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/ScoreSamplesViaVGL.pm,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** ScoreSamplesViaVGL.pm 22 Feb 2006 09:14:59 -0000 1.5 --- ScoreSamplesViaVGL.pm 23 Feb 2006 22:11:59 -0000 1.6 *************** *** 67,71 **** # now loop through each profile cutoff and score each file foreach my $profile_count_cutoff (@profile_count_cutoffs) { ! open SUMMARY, ">".$self->{output_summary_file}."_profile_count_$profile_count_cutoff.txt" or die; if ($profile_count_cutoff ne 'all') { # read the profiles to test with here --- 67,71 ---- # now loop through each profile cutoff and score each file foreach my $profile_count_cutoff (@profile_count_cutoffs) { ! open SUMMARY, ">".$self->{output_summary_file}."_profile_count_$profile_count_cutoff.txt" or die "Can't open ".$self->{output_summary_file}."_profile_count_$profile_count_cutoff.txt for writing\n"; if ($profile_count_cutoff ne 'all') { # read the profiles to test with here *************** *** 124,128 **** if (!defined ($sample_mean )) { die "Sample Mean not defined for $file $probe\n"; } ! if (defined($sample_mean) && abs($sample_mean/$score_mean) >= 2) { #print " Yes this is positive for $hc\n"; $class->{$subdir}{$profile_count_cutoff}{$file}{scores}{$hc}{raw_score} += 1; --- 124,132 ---- if (!defined ($sample_mean )) { die "Sample Mean not defined for $file $probe\n"; } ! # BUG: I think the next line of code was incorrect!! ! # FIXME: this actually performs worse so I think I need to keep track of which VG are actually lower expression ! # then the others ! 
#if (defined($sample_mean) && abs($sample_mean/$score_mean) >= 2) { ! if (defined($sample_mean) && (($sample_mean/$score_mean) >= 2 || ($sample_mean/$score_mean) <= 0.5 )) { #print " Yes this is positive for $hc\n"; $class->{$subdir}{$profile_count_cutoff}{$file}{scores}{$hc}{raw_score} += 1; Index: ReadVGLOutput.pm =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/ReadVGLOutput.pm,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** ReadVGLOutput.pm 22 Feb 2006 09:14:59 -0000 1.7 --- ReadVGLOutput.pm 23 Feb 2006 22:11:58 -0000 1.8 *************** *** 61,65 **** $results->{frequency}{$subdir} = {}; $results->{comparison_to_reference}{$subdir} = {}; - # FIXME my @files = glob("$profile_output_dir/$subdir/file_list_*.txt_results"); --- 61,64 ---- *************** *** 68,72 **** my $curr_output; if ($parser_type eq '2_phenotypes') { $curr_output = $self->_parse_2_pheno_vgl_output("$file/$input_file_name", $col_offset, $base_col, $pheno_str_1, $pheno_str_2); } ! else { $curr_output = $self->_parse_vgl_output("$file/$input_file_name", $col_offset); } # this is used to store a count/lines for all profiles encountered in this particular subdir $self->_add_to_all_profiles($curr_output, $results->{all_profiles}{$subdir}); --- 67,75 ---- my $curr_output; if ($parser_type eq '2_phenotypes') { $curr_output = $self->_parse_2_pheno_vgl_output("$file/$input_file_name", $col_offset, $base_col, $pheno_str_1, $pheno_str_2); } ! else { ! $curr_output = $self->_parse_vgl_output("$file/$input_file_name", $col_offset); ! #print "HERE!!!!".Dumper $curr_output; ! #die; ! } # this is used to store a count/lines for all profiles encountered in this particular subdir $self->_add_to_all_profiles($curr_output, $results->{all_profiles}{$subdir}); |
|
From: <boc...@su...> - 2006-02-22 09:15:37
|
Update of /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job In directory sumo.genetics.ucla.edu:/tmp/cvs-serv8351/lib/perl/Nelson/Pipe/Container/Job Modified Files: CreateProfiles.pm PPLARunner.pm PickTopProfiles.pm ReadProfileOutput.pm ReadVGLOutput.pm ScoreSamplesViaProfiles.pm ScoreSamplesViaVGL.pm SifFileParser.pm SummarizeClassification.pm Added Files: CreateProfiles_2.pm Log Message: Many updates to the existing logic analysis libs and also a lot of new additions particularly scripts Index: ScoreSamplesViaProfiles.pm =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/ScoreSamplesViaProfiles.pm,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** ScoreSamplesViaProfiles.pm 27 Sep 2005 22:54:38 -0000 1.5 --- ScoreSamplesViaProfiles.pm 22 Feb 2006 09:14:59 -0000 1.6 *************** *** 28,32 **** # for each file in the test set my @files; ! open LIST, $list_of_files or die; while(<LIST>) { chomp; --- 28,32 ---- # for each file in the test set my @files; ! open LIST, $list_of_files or die "Can't open $list_of_files\n"; while(<LIST>) { chomp; Index: PickTopProfiles.pm =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/PickTopProfiles.pm,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** PickTopProfiles.pm 19 Aug 2005 23:17:09 -0000 1.5 --- PickTopProfiles.pm 22 Feb 2006 09:14:58 -0000 1.6 *************** *** 31,35 **** my @subdirs = split /,/, $self->{profile_sub_dirs}; ! my @original_profiles = @{$self->_read_profiles($self->{complete_ppla_output})}; foreach my $subdir (@subdirs) { --- 31,35 ---- my @subdirs = split /,/, $self->{profile_sub_dirs}; ! 
#my @original_profiles = @{$self->_read_profiles($self->{complete_ppla_output})}; foreach my $subdir (@subdirs) { *************** *** 42,46 **** foreach my $b (keys %{$data->{frequency}{$subdir}{tally}{$c}}) { foreach my $a (keys %{$data->{frequency}{$subdir}{tally}{$c}{$b}}) { ! #print "DEBUG: $c $b $a $count_min ".$data->{frequency}{$subdir}{tally}{$c}{$b}{$a}."\n"; if ($data->{frequency}{$subdir}{tally}{$c}{$b}{$a} >= $count_min) { ##my $cache = $data->{all_profiles}; # done for memory issues --- 42,46 ---- foreach my $b (keys %{$data->{frequency}{$subdir}{tally}{$c}}) { foreach my $a (keys %{$data->{frequency}{$subdir}{tally}{$c}{$b}}) { ! print "DEBUG: $c $b $a $count_min ".$data->{frequency}{$subdir}{tally}{$c}{$b}{$a}."\n"; if ($data->{frequency}{$subdir}{tally}{$c}{$b}{$a} >= $count_min) { ##my $cache = $data->{all_profiles}; # done for memory issues *************** *** 79,83 **** my ($self, $file) = @_; my @results; ! open PROFILES, $file or die; while(<PROFILES>) { chomp; --- 79,83 ---- my ($self, $file) = @_; my @results; ! 
open PROFILES, $file or die "Can't open file $file\n"; while(<PROFILES>) { chomp; --- NEW FILE: CreateProfiles_2.pm --- package Nelson::Pipe::Container::Job::CreateProfiles_2; use base qw(Nelson::Pipe::Container::Job); use strict; use Data::Dumper; use Storable; =head2 process Title : process Usage : Function: Example : Returns : Args : =cut sub process{ my ($self,$input,$output,$stash) = @_; my $file_list_dir = $input->{file_list_dir}{value}; my $file_list_sub_dir = $input->{file_list_sub_dir}{value}; my $mas5_dir = $input->{mas5_dir}{value}; my $profile_dir = $output->{profile_dir}{value}; my $sif_file_hash_name = $input->{sif_file_hash}{value}; my $sif_file_hash = $stash->{$sif_file_hash_name}; my $start = $input->{start}{value}; my $end = $input->{end}{value}; my $pre_cache_mas5 = $input->{pre_cache_mas5}{value}; my $mas5_cache_output = $output->{mas5_cache_output}{value}; # try to load the sif_file_hash if undef if (!defined($sif_file_hash)) { $sif_file_hash = retrieve($self->{sif_file_hash_storable}); } # pre-cache the mas5 results $self->{mas5_cache} = {}; if ($pre_cache_mas5 eq '1') { $self->{mas5_cache} = $self->_pre_cache_mas5($mas5_dir); } my @sub_dirs = split /,/, $file_list_sub_dir; foreach my $sub_dir (@sub_dirs) { system("mkdir -p $profile_dir/$sub_dir"); my @files = glob("$file_list_dir/$sub_dir/*.txt"); foreach my $list_of_cel_files (@files) { $list_of_cel_files =~ /_(\d+)\.txt$/; my $file_num = $1; if($file_num >= $start && $file_num < $end) { $self->_process_cel_files($list_of_cel_files, $mas5_dir, $profile_dir, $sub_dir, $sif_file_hash); } } } if ($pre_cache_mas5 eq '1' and defined($mas5_cache_output)) { store $self->{mas5_cache}, $mas5_cache_output; } } sub _pre_cache_mas5 { my ($self, $mas5_dir) = @_; my $cache = {}; my @files = glob("$mas5_dir/*.txt"); foreach my $file (@files) { open FILE, "<$file" or die "can't open $file"; $file =~ /\/([^\/]+)\.txt$/; my $filename = $1; push @{$cache->{filenames}}, $filename; $file =~ s/ /_/g; $file =~ 
s/#/_/g; while(<FILE>) { my @tokens = split /\t/; if ($self->{parse_old_mas5} && scalar(@tokens) > 4 && $tokens[0] !~ /^Probe/) { #print "P-value: $tokens[5]\n"; exit; $tokens[4] =~ /([PAM])/; if ($tokens[5] <= 0.05) { ##if ($1 eq 'P') { #$cache->{probes}{$tokens[0]}{$filename}{'call'} = 1; push @{$cache->{probes}{$tokens[0]}}, 1; } else { #$cache->{probes}{$tokens[0]}{$filename}{'call'} = 0; push @{$cache->{probes}{$tokens[0]}}, 0; } } elsif ($self->{parse_old_mas5} == 0) { #print "New pvalue: $tokens[1]\n"; exit; $tokens[2] =~ /([PAM])/; if ($tokens[1] <= 0.05) { ##if ($1 eq 'P') { #$cache->{probes}{$tokens[0]}{$filename}{'call'} = 1; push @{$cache->{probes}{$tokens[0]}}, 1; } else { #$cache->{probes}{$tokens[0]}{$filename}{'call'} = 0; push @{$cache->{probes}{$tokens[0]}}, 0; } } } close FILE; #print Dumper($cache); exit; } return($cache); } sub _process_cel_files { my ($self, $list_of_cel_files, $mas5_dir, $profile_dir, $sub_dir, $sif_hash) = @_; my $results = {}; $list_of_cel_files =~ /file_list_(\d+)\.txt/; my $file_num = $1; open LIST, "<$list_of_cel_files" or die; my $i = 0; while(<LIST>) { chomp; my $filename = $_; my $old_filename = $filename; $filename =~ s/ /_/g; $filename =~ s/#/_/g; push @{$results->{samples}{names}}, $filename; #$results->{samples}{names}{$filename} = 1; print "Getting Annotations for $filename\n"; $self->_get_annotations($filename, $old_filename, $results, $sif_hash); #print "FOO ".Dumper($results); exit; print "Parsing File $filename\n"; $self->_parse_file($old_filename, $results, $mas5_dir); #print "On file $i $filename\n"; $i++; } close LIST; # at this point all the P/A calls are loaded for all the files # in the list, next print it out print "Printing profile\n"; my $outfile = "$profile_dir/$sub_dir/file_list_$file_num.profile"; $self->_print_profile($results, $outfile); } sub _get_annotations { my ($self, $filename, $old_filename, $results, $sif_hash) = @_; #print Dumper($sif_hash); exit; # HC if 
($sif_hash->{'files'}{$old_filename}{hc} =~ /1A/) { $results->{samples}{annotations}{HC_1A}{$filename} = 1; $results->{samples}{annotations}{HC_1B}{$filename} = 0; $results->{samples}{annotations}{HC_2A}{$filename} = 0; $results->{samples}{annotations}{HC_2B}{$filename} = 0; } elsif ($sif_hash->{'files'}{$old_filename}{hc} =~ /1B/) { $results->{samples}{annotations}{HC_1A}{$filename} = 0; $results->{samples}{annotations}{HC_1B}{$filename} = 1; $results->{samples}{annotations}{HC_2A}{$filename} = 0; $results->{samples}{annotations}{HC_2B}{$filename} = 0; } elsif ($sif_hash->{'files'}{$old_filename}{hc} =~ /2A/) { $results->{samples}{annotations}{HC_1A}{$filename} = 0; $results->{samples}{annotations}{HC_1B}{$filename} = 0; $results->{samples}{annotations}{HC_2A}{$filename} = 1; $results->{samples}{annotations}{HC_2B}{$filename} = 0; } elsif ($sif_hash->{'files'}{$old_filename}{hc} =~ /2B/) { $results->{samples}{annotations}{HC_1A}{$filename} = 0; $results->{samples}{annotations}{HC_1B}{$filename} = 0; $results->{samples}{annotations}{HC_2A}{$filename} = 0; $results->{samples}{annotations}{HC_2B}{$filename} = 1; } # tumor type if ($sif_hash->{'files'}{$old_filename}{tumor_type} =~ /mixed/) { $results->{samples}{annotations}{tumor_type_mixed}{$filename} = 1; $results->{samples}{annotations}{tumor_type_gbm}{$filename} = 0; $results->{samples}{annotations}{tumor_type_oligo}{$filename} = 0; $results->{samples}{annotations}{tumor_type_astro}{$filename} = 0; } elsif ($sif_hash->{'files'}{$old_filename}{tumor_type} =~ /gbm/) { $results->{samples}{annotations}{tumor_type_mixed}{$filename} = 0; $results->{samples}{annotations}{tumor_type_gbm}{$filename} = 1; $results->{samples}{annotations}{tumor_type_oligo}{$filename} = 0; $results->{samples}{annotations}{tumor_type_astro}{$filename} = 0; } elsif ($sif_hash->{'files'}{$old_filename}{tumor_type} =~ /oligo/) { $results->{samples}{annotations}{tumor_type_mixed}{$filename} = 0; 
$results->{samples}{annotations}{tumor_type_gbm}{$filename} = 0; $results->{samples}{annotations}{tumor_type_oligo}{$filename} = 1; $results->{samples}{annotations}{tumor_type_astro}{$filename} = 0; } elsif ($sif_hash->{'files'}{$old_filename}{tumor_type} =~ /astro/) { $results->{samples}{annotations}{tumor_type_mixed}{$filename} = 0; $results->{samples}{annotations}{tumor_type_gbm}{$filename} = 0; $results->{samples}{annotations}{tumor_type_oligo}{$filename} = 0; $results->{samples}{annotations}{tumor_type_astro}{$filename} = 1; } # sex if ($sif_hash->{'files'}{$old_filename}{sex} =~ /F/) { $results->{samples}{annotations}{sex_f}{$filename} = 1; $results->{samples}{annotations}{sex_m}{$filename} = 0; } elsif ($sif_hash->{'files'}{$old_filename}{sex} =~ /M/) { $results->{samples}{annotations}{sex_f}{$filename} = 0; $results->{samples}{annotations}{sex_m}{$filename} = 1; } # grade if ($sif_hash->{'files'}{$old_filename}{grade} =~ /3/) { $results->{samples}{annotations}{grade_3}{$filename} = 1; $results->{samples}{annotations}{grade_4}{$filename} = 0; } elsif ($sif_hash->{'files'}{$old_filename}{grade} =~ /4/) { $results->{samples}{annotations}{grade_3}{$filename} = 0; $results->{samples}{annotations}{grade_4}{$filename} = 1; } # survival cluster if ($sif_hash->{'files'}{$old_filename}{survival_cluster} =~ /SC1/) { $results->{samples}{annotations}{survival_cluster_1}{$filename} = 1; $results->{samples}{annotations}{survival_cluster_2}{$filename} = 0; } elsif ($sif_hash->{'files'}{$old_filename}{survival_cluster} =~ /SC2/) { $results->{samples}{annotations}{survival_cluster_1}{$filename} = 0; $results->{samples}{annotations}{survival_cluster_2}{$filename} = 1; } # survival time # this is a bit more tricky because I want a profile for each survival time my $i=0; foreach my $survival_time (sort {$a <=> $b} keys %{$sif_hash->{'files_by_survival_time'}}) { $i++; if ($sif_hash->{'files'}{$old_filename}{survival_time} >= $survival_time) { 
$results->{samples}{annotations}{"survial_time_group_$i"}{$filename} = 1; } else { $results->{samples}{annotations}{"survial_time_group_$i"}{$filename} = 0; } } # These were added for the p53 Breast Cancer Dataset Miller et al 2005 # grade (grade 4 doesn't exist in this dataset!!) if ($sif_hash->{'files'}{$old_filename}{grade} =~ /G1/) { $results->{samples}{annotations}{grade_1}{$filename} = 1; $results->{samples}{annotations}{grade_2}{$filename} = 0; $results->{samples}{annotations}{grade_3}{$filename} = 0; $results->{samples}{annotations}{grade_4}{$filename} = 0; } elsif ($sif_hash->{'files'}{$old_filename}{grade} =~ /G2/) { $results->{samples}{annotations}{grade_1}{$filename} = 0; $results->{samples}{annotations}{grade_2}{$filename} = 1; $results->{samples}{annotations}{grade_3}{$filename} = 0; $results->{samples}{annotations}{grade_4}{$filename} = 0; } elsif ($sif_hash->{'files'}{$old_filename}{grade} =~ /G3/) { $results->{samples}{annotations}{grade_1}{$filename} = 0; $results->{samples}{annotations}{grade_2}{$filename} = 0; $results->{samples}{annotations}{grade_3}{$filename} = 1; $results->{samples}{annotations}{grade_4}{$filename} = 0; } # lymph_pos if ($sif_hash->{'files'}{$old_filename}{lymph_pos} =~ /0/) { $results->{samples}{annotations}{lymph_pos}{$filename} = 0; $results->{samples}{annotations}{lymph_neg}{$filename} = 1; } elsif ($sif_hash->{'files'}{$old_filename}{lymph_pos} =~ /1/) { $results->{samples}{annotations}{lymph_pos}{$filename} = 1; $results->{samples}{annotations}{lymph_neg}{$filename} = 0; } # er_wt if ($sif_hash->{'files'}{$old_filename}{er_wt} =~ /1/) { $results->{samples}{annotations}{er_wt}{$filename} = 1; $results->{samples}{annotations}{er_mt}{$filename} = 0; } elsif ($sif_hash->{'files'}{$old_filename}{er_wt} =~ /0/) { $results->{samples}{annotations}{er_wt}{$filename} = 0; $results->{samples}{annotations}{er_mt}{$filename} = 1; } # pgr_wt if ($sif_hash->{'files'}{$old_filename}{pgr_wt} =~ /1/) { 
$results->{samples}{annotations}{pgr_wt}{$filename} = 1; $results->{samples}{annotations}{pgr_mt}{$filename} = 0; } elsif ($sif_hash->{'files'}{$old_filename}{pgr_wt} =~ /0/) { $results->{samples}{annotations}{pgr_wt}{$filename} = 0; $results->{samples}{annotations}{pgr_mt}{$filename} = 1; } # dlda_mt if ($sif_hash->{'files'}{$old_filename}{dlda_mt} =~ /1/) { $results->{samples}{annotations}{dlda_wt}{$filename} = 0; $results->{samples}{annotations}{dlda_mt}{$filename} = 1; } elsif ($sif_hash->{'files'}{$old_filename}{dlda_mt} =~ /0/) { $results->{samples}{annotations}{dlda_wt}{$filename} = 1; $results->{samples}{annotations}{dlda_mt}{$filename} = 0; } # p53_wt if ($sif_hash->{'files'}{$old_filename}{p53_wt} =~ /1/) { $results->{samples}{annotations}{p53_wt}{$filename} = 1; $results->{samples}{annotations}{p53_mt}{$filename} = 0; } elsif ($sif_hash->{'files'}{$old_filename}{p53_wt} =~ /0/) { $results->{samples}{annotations}{p53_wt}{$filename} = 0; $results->{samples}{annotations}{p53_mt}{$filename} = 1; } } sub _print_profile { my ($self, $results, $outfile) = @_; open OUTFILE, ">$outfile" or die "Can't open outfile: $outfile\n"; #print OUTFILE "samples\t".join("\t", sort keys %{$results->{samples}{names}}),"\n"; print OUTFILE "samples\t".join("\t", @{$results->{samples}{names}}),"\n"; #print "FOO\n"; #print Dumper($results->{samples}); exit; #print Dumper($results->{samples}{annotations}); exit; foreach my $anno (sort keys %{$results->{samples}{annotations}}) { print OUTFILE "$anno\t"; my $curr_col = 0; my $total_col = scalar(keys %{$results->{samples}{annotations}{$anno}}); #foreach my $samp_file (sort keys %{$results->{samples}{annotations}{$anno}}) { foreach my $samp_file (@{$results->{samples}{names}}) { $curr_col++; print OUTFILE $results->{samples}{annotations}{$anno}{$samp_file}; if ($curr_col < $total_col) { print OUTFILE "\t"; } } print OUTFILE "\n"; } foreach my $probe (sort keys %{$results->{probes}}) { #print "Size: 
".scalar(@{$results->{probes}{$probe}})."\n"; #print OUTFILE "$probe\t".join("\t", @{$results->{probes}{$probe}}),"\n"; print OUTFILE "$probe\t"; #foreach my $filename (sort keys %{$results->{probes}{$probe}}) { foreach my $value (@{$results->{probes}{$probe}}) { #print OUTFILE $results->{probes}{$probe}{$filename}{'call'}, "\t"; print OUTFILE "$value\t"; } print OUTFILE "\n"; } close OUTFILE; } sub _parse_file { my ($self, $file, $results, $mas5_dir) = @_; if (defined($self->{mas5_cache}{probes})) { # then the cache is used # find the offset for this file #print "The file is: $file\n"; my $curr_index = 0; my $index = 0; foreach my $filename (@{$self->{mas5_cache}{filenames}}) { #print " other filename: $filename\n"; if ($filename eq $file) { $index = $curr_index; last; } $curr_index++; } #print "The index is: $index\n"; # now iterate over and copy calls to results structure foreach my $probe (keys %{$self->{mas5_cache}{probes}}) { #$results->{probes}{$probe}{$file}{'call'} = $self->{mas5_cache}{probes}{$probe}[$index]; push @{$results->{probes}{$probe}}, $self->{mas5_cache}{probes}{$probe}[$index]; } } else { # FIXME: I don't think this will work anymore since moved to arrays rather than hashes open FILE, "<$mas5_dir/$file.txt" or die "can't open $mas5_dir/$file"; my $filename = $file; $file =~ s/ /_/g; $file =~ s/#/_/g; while(<FILE>) { my @tokens = split /\t/; if ($self->{parse_old_mas5} && scalar(@tokens) > 4 && $tokens[0] !~ /^Probe/) { #print "P-value: $tokens[5]\n"; exit; $tokens[4] =~ /([PAM])/; if ($tokens[5] <= 0.05) { ##if ($1 eq 'P') { $results->{probes}{$tokens[0]}{$file}{'call'} = 1; } else { $results->{probes}{$tokens[0]}{$file}{'call'} = 0; } } elsif ($self->{parse_old_mas5} == 0) { #print "New pvalue: $tokens[1]\n"; exit; $tokens[2] =~ /([PAM])/; if ($tokens[1] <= 0.05) { ##if ($1 eq 'P') { $results->{probes}{$tokens[0]}{$file}{'call'} = 1; } else { $results->{probes}{$tokens[0]}{$file}{'call'} = 0; } } } close FILE; } } Index: SifFileParser.pm 
=================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/SifFileParser.pm,v retrieving revision 1.9 retrieving revision 1.10 diff -C2 -d -r1.9 -r1.10 *** SifFileParser.pm 17 Feb 2006 00:09:22 -0000 1.9 --- SifFileParser.pm 22 Feb 2006 09:15:00 -0000 1.10 *************** *** 25,29 **** my $samples; ! if ($sif_format eq 'geo') { $samples = $self->_read_geo_sample_list($sif_file, $file_map_file); } else { $samples = $self->_read_sample_list($sif_file, $self->{phenotypes}, $self->{col_ordering}); } $stash->{$output_hash_name} = $samples; --- 25,31 ---- my $samples; ! ! if ($sif_format eq 'simple') { $samples = $self->_read_simple_sample_list($sif_file); } ! elsif ($sif_format eq 'geo') { $samples = $self->_read_geo_sample_list($sif_file, $file_map_file); } else { $samples = $self->_read_sample_list($sif_file, $self->{phenotypes}, $self->{col_ordering}); } $stash->{$output_hash_name} = $samples; *************** *** 130,133 **** --- 132,156 ---- my $final_output = {'files_by_hc' => $self->{files_by_hc}, 'files' => $self->{files}, 'files_by_survival_time' => $self->{files_by_survival_time}}; return($final_output); + } + + # this just reads a three column tab file used by Marc's (Bin's) program + # that links filename (without extension) to phenotype. 
It's only useful + # for linking files to one phenotype class at a time + sub _read_simple_sample_list { + my ($self, $input_sample_list) = @_; + open INPUT, "<$input_sample_list" or die; + while (<INPUT>) { + chomp; + my @tokens = split /\t/; + my $filename = $tokens[1]; + my $pheno = $tokens[2]; + if ($pheno eq 'TAL_R') { $self->{files}{$filename}{TAL_R} = 1; } + else { $self->{files}{$filename}{TAL_R} = 0; } + } + close INPUT; + + my $final_output = {'files' => $self->{files}}; + return($final_output); + } Index: CreateProfiles.pm =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/CreateProfiles.pm,v retrieving revision 1.15 retrieving revision 1.16 diff -C2 -d -r1.15 -r1.16 *** CreateProfiles.pm 17 Feb 2006 00:09:22 -0000 1.15 --- CreateProfiles.pm 22 Feb 2006 09:14:57 -0000 1.16 *************** *** 28,31 **** --- 28,32 ---- my $end = $input->{end}{value}; my $pre_cache_mas5 = $input->{pre_cache_mas5}{value}; + my $no_overwrite = $input->{no_overwrite}{value}; my $mas5_cache_output = $output->{mas5_cache_output}{value}; *************** *** 115,118 **** --- 116,121 ---- $list_of_cel_files =~ /file_list_(\d+)\.txt/; my $file_num = $1; + my $outfile = "$profile_dir/$sub_dir/file_list_$file_num.profile"; + if ($self->{no_overwrite} && -f "$profile_dir/$sub_dir/file_list_$file_num.profile") { return; } open LIST, "<$list_of_cel_files" or die; my $i = 0; *************** *** 136,140 **** # in the list, next print it out print "Printing profile\n"; - my $outfile = "$profile_dir/$sub_dir/file_list_$file_num.profile"; $self->_print_profile($results, $outfile); } --- 139,142 ---- *************** *** 297,300 **** --- 299,310 ---- } + # These were added for the T-cell leukemia dataset Soulier et al 2005 + if ($sif_hash->{'files'}{$old_filename}{TAL_R} =~ /1/) { + $results->{samples}{annotations}{TAL_R}{$filename} = 1; + 
$results->{samples}{annotations}{HOX_R}{$filename} = 0; + } elsif ($sif_hash->{'files'}{$old_filename}{TAL_R} =~ /0/) { + $results->{samples}{annotations}{TAL_R}{$filename} = 0; + $results->{samples}{annotations}{HOX_R}{$filename} = 1; + } } Index: ReadVGLOutput.pm =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/ReadVGLOutput.pm,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** ReadVGLOutput.pm 31 Oct 2005 19:02:21 -0000 1.6 --- ReadVGLOutput.pm 22 Feb 2006 09:14:59 -0000 1.7 *************** *** 24,30 **** --- 24,36 ---- my $profile_output_dir = $self->{profile_output_dir}; my $profile_output_sub_dirs = $self->{profile_output_sub_dirs}; + my $compare_to_reference = $self->{compare_to_reference}; my $reference_profile = $self->{reference_profile}; my $output_dir = $self->{output_dir}; my $col_offset = $self->{col_offset}; + my $parser_type = $self->{parser_type}; + my $base_col = $self->{base_col}; + my $pheno_str_1 = $self->{pheno_str_1}; + my $pheno_str_2 = $self->{pheno_str_2}; + my $input_file_name = $self->{input_file_name}; # the hash that holds hash and cache data *************** *** 36,41 **** # the reference sample ! my $reference = $self->_parse_vgl_output($reference_profile, 0); ! $results->{parsed_output}{'100'}{reference} = $reference; # comment out, these may have already been created --- 42,50 ---- # the reference sample ! my $reference; ! if ($compare_to_reference) { ! $reference = $self->_parse_vgl_output($reference_profile, 0); ! $results->{parsed_output}{'100'}{reference} = $reference; ! } # comment out, these may have already been created *************** *** 56,61 **** my @files = glob("$profile_output_dir/$subdir/file_list_*.txt_results"); foreach my $file (@files) { ! print "Curr Profile: $file/All_DChip_expression_vals_longNames_groupTtest.xls\n"; ! 
my $curr_output = $self->_parse_vgl_output("$file/All_DChip_expression_vals_longNames_groupTtest.xls", $col_offset); # this is used to store a count/lines for all profiles encountered in this particular subdir $self->_add_to_all_profiles($curr_output, $results->{all_profiles}{$subdir}); --- 65,72 ---- my @files = glob("$profile_output_dir/$subdir/file_list_*.txt_results"); foreach my $file (@files) { ! print "Curr Profile: $file/$input_file_name\n"; ! my $curr_output; ! if ($parser_type eq '2_phenotypes') { $curr_output = $self->_parse_2_pheno_vgl_output("$file/$input_file_name", $col_offset, $base_col, $pheno_str_1, $pheno_str_2); } ! else { $curr_output = $self->_parse_vgl_output("$file/$input_file_name", $col_offset); } # this is used to store a count/lines for all profiles encountered in this particular subdir $self->_add_to_all_profiles($curr_output, $results->{all_profiles}{$subdir}); *************** *** 64,68 **** ##$results->{parsed_output}{$subdir}{$file} = $curr_output; # not used by other objects so eliminate to reduce memory usage $results->{comparison_to_reference}{$subdir}{$file} = {}; # stores percentage overlap between refernce and curr profile set ! $self->_compare_vgl_output($reference, $curr_output, $results->{comparison_to_reference}{$subdir}{$file}); $self->_tally_results($curr_output, $results->{frequency}{$subdir}); } --- 75,79 ---- ##$results->{parsed_output}{$subdir}{$file} = $curr_output; # not used by other objects so eliminate to reduce memory usage $results->{comparison_to_reference}{$subdir}{$file} = {}; # stores percentage overlap between refernce and curr profile set ! 
if ($compare_to_reference) { $self->_compare_vgl_output($reference, $curr_output, $results->{comparison_to_reference}{$subdir}{$file}); } $self->_tally_results($curr_output, $results->{frequency}{$subdir}); } *************** *** 75,78 **** --- 86,90 ---- # DEBUG + print "DEBUG DUMP!\n"; print Dumper($results->{comparison_to_reference}); print Dumper($results->{frequency}); *************** *** 113,116 **** --- 125,160 ---- } } + + # only one column needs to be examined when there is one phenotype with two states + # FIXME: the phenotype is hardcoded here!!! + sub _parse_2_pheno_vgl_output { + my ($self, $file, $offset, $base, $pheno_str_1, $pheno_str_2) = @_; + open INPUT, "<$file" or die "Can't open file: $file\n"; + my $result; + while(<INPUT>) { + if (!/^probe.set/) { + chomp; + my @tokens = split /\t/; + my $HC = ""; + my $pvalue = $tokens[$base+5]; + my $mean = $tokens[$base+3]; + my $true_count = 0; + if ($tokens[$base+6] eq "TRUE" && $tokens[$base+3] >= 0) { + $HC = $pheno_str_1; + $true_count++; + } elsif ($tokens[$base+6] eq "TRUE" && $tokens[$base+3] < 0) { + $HC = $pheno_str_2; + $true_count++; + } + if ($true_count == 1) { + $result->{$HC}{$tokens[0]}{pvalue} = $pvalue; + $result->{$HC}{$tokens[0]}{mean} = $mean + } + } + } + close INPUT; + return($result); + } + sub _parse_vgl_output { Index: SummarizeClassification.pm =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/SummarizeClassification.pm,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** SummarizeClassification.pm 5 Oct 2005 00:04:20 -0000 1.5 --- SummarizeClassification.pm 22 Feb 2006 09:15:00 -0000 1.6 *************** *** 64,68 **** my $correct_annotation = $data->{$subdir}{$number_profiles_used}{$file}{correct_annotation}; print "Highest: $highest_phenotype Correct: $correct_annotation\n"; ! 
if ($highest_phenotype =~ /$correct_annotation/) { $output->{$subdir}{$number_profiles_used}{correct_classification}++; print "CORRECT!!!\n"; } --- 64,68 ---- my $correct_annotation = $data->{$subdir}{$number_profiles_used}{$file}{correct_annotation}; print "Highest: $highest_phenotype Correct: $correct_annotation\n"; ! if ($highest_phenotype =~ /$correct_annotation/i) { $output->{$subdir}{$number_profiles_used}{correct_classification}++; print "CORRECT!!!\n"; } Index: PPLARunner.pm =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/PPLARunner.pm,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** PPLARunner.pm 8 Dec 2005 02:30:46 -0000 1.7 --- PPLARunner.pm 22 Feb 2006 09:14:58 -0000 1.8 *************** *** 21,24 **** --- 21,25 ---- my $start = $input->{start}{value}; my $end = $input->{end}{value}; + my $no_overwrite = $input->{no_overwrite}{value}; foreach my $subdir (split /,/, $self->{profiles_sub_dirs}) { *************** *** 26,30 **** print "DIRECTORY: ".$self->{profiles_dir}."/$subdir\n"; foreach my $file (@files) { - $file =~ /_(\d+)\.profile$/; my $file_num = $1; --- 27,30 ---- *************** *** 34,41 **** $file =~ /\/(\w+).profile$/; my $filename = $1; my $command = $self->{ppla_bin}." ".$self->{entropy_filter}." ".$self->{individual_u_max}." ".$self->{together_u_min}." showNoBits ".$self->{lowA}." ".$self->{highA}." ".$self->{number_profiles}." < $file > ".$self->{output_dir}."/$subdir/$filename.".$self->{unique_id}."output"; print STDERR "$command\n"; - system($command); system("mkdir -p ".$self->{output_dir}."/".$subdir); # FIXME: what do I need to do to get logging working!?!? #$self->log("PPLARunner", $command); --- 34,42 ---- $file =~ /\/(\w+).profile$/; my $filename = $1; + next if ($no_overwrite && -f $self->{output_dir}."/$subdir/$filename.".$self->{unique_id}."output"); my $command = $self->{ppla_bin}." 
".$self->{entropy_filter}." ".$self->{individual_u_max}." ".$self->{together_u_min}." showNoBits ".$self->{lowA}." ".$self->{highA}." ".$self->{number_profiles}." < $file > ".$self->{output_dir}."/$subdir/$filename.".$self->{unique_id}."output"; print STDERR "$command\n"; system("mkdir -p ".$self->{output_dir}."/".$subdir); + system($command); # FIXME: what do I need to do to get logging working!?!? #$self->log("PPLARunner", $command); Index: ScoreSamplesViaVGL.pm =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/ScoreSamplesViaVGL.pm,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** ScoreSamplesViaVGL.pm 31 Oct 2005 19:02:21 -0000 1.4 --- ScoreSamplesViaVGL.pm 22 Feb 2006 09:14:59 -0000 1.5 *************** *** 28,31 **** --- 28,32 ---- my $exp_values = $self->_read_exp_values($self->{exp_values}); + #print Dumper($exp_values); exit; # for each file in the test set *************** *** 144,148 **** while(<IN>) { chomp; ! if (/^probe_set/) { @files = split /\t/; shift @files; --- 145,149 ---- while(<IN>) { chomp; ! if (/^probe.set/) { @files = split /\t/; shift @files; Index: ReadProfileOutput.pm =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/ReadProfileOutput.pm,v retrieving revision 1.8 retrieving revision 1.9 diff -C2 -d -r1.8 -r1.9 *** ReadProfileOutput.pm 8 Dec 2005 02:30:46 -0000 1.8 --- ReadProfileOutput.pm 22 Feb 2006 09:14:59 -0000 1.9 *************** *** 23,26 **** --- 23,27 ---- my $profile_output_sub_dirs = $self->{profile_output_sub_dirs}; my $reference_profile = $self->{reference_profile}; + my $compare_to_reference = $self->{compare_to_reference}; my $output_dir = $self->{output_dir}; my @profiles = split /,/, $self->{profiles_to_count}; *************** *** 30,35 **** # the reference sample ! 
my $reference = $self->_parse_ppla_output($reference_profile, \@profiles); ! $results->{parsed_output}{'100'}{reference} = $reference; $Data::Dumper::Indent = 1; --- 31,39 ---- # the reference sample ! my $reference; ! if ($compare_to_reference) { ! $reference = $self->_parse_ppla_output($reference_profile, \@profiles); ! $results->{parsed_output}{'100'}{reference} = $reference; ! } $Data::Dumper::Indent = 1; *************** *** 66,70 **** $results->{comparison_to_reference}{$subdir}{$file} = {}; # stores percentage overlap between refernce and curr profile set ! $self->_compare_ppla_output($reference, $curr_output, $results->{comparison_to_reference}{$subdir}{$file}); $self->_tally_results($curr_output, $results->{frequency}{$subdir}); } --- 70,74 ---- $results->{comparison_to_reference}{$subdir}{$file} = {}; # stores percentage overlap between refernce and curr profile set ! if ($compare_to_reference) { $self->_compare_ppla_output($reference, $curr_output, $results->{comparison_to_reference}{$subdir}{$file}); } $self->_tally_results($curr_output, $results->{frequency}{$subdir}); } *************** *** 76,81 **** # DEBUG #print Dumper($results->{comparison_to_reference}); ! #print Dumper(keys %{$results->{frequency}{50}}); } --- 80,87 ---- # DEBUG + #print Dumper (keys %{$results}); #print Dumper($results->{comparison_to_reference}); ! print Dumper(keys %{$results->{frequency}{75}}); ! print Dumper($results->{frequency}{75}); } |
|
From: <boc...@su...> - 2006-02-22 09:15:07
|
Update of /cvsroot/dev-boconnor/project_logic_analysis/data In directory sumo.genetics.ucla.edu:/tmp/cvs-serv8351/data Added Files: classification_scatterplot.sxc Log Message: Many updates to the existing logic analysis libs and also a lot of new additions, particularly scripts --- NEW FILE: classification_scatterplot.sxc --- (This appears to be a binary file; contents omitted.) |
Update of /cvsroot/dev-boconnor/project_logic_analysis/scripts In directory sumo.genetics.ucla.edu:/tmp/cvs-serv8351/scripts Added Files: count_vgl_output_columns.pl fix_testset_annotations.pl fix_testset_t-cell_annotations.pl mkdir_data_dir.sh test_row_length.pl Log Message: Many updates to the existing logic analysis libs and also a lot of new addtions particularly scripts --- NEW FILE: fix_testset_t-cell_annotations.pl --- #!/usr/local/bin/perl use Storable; my ($file_list, $sif_storable) = @ARGV; my $data = retrieve($sif_storable); open INPUT, $file_list or die; while(<INPUT>) { chomp; print "$_\t"; if ($data->{files}{$_}{TAL_R}) { print "TAL_R\n"; } else { print "HOX_R\n"; } } --- NEW FILE: mkdir_data_dir.sh --- #!/bin/sh mkdir $1 mkdir $1/hypergeometric mkdir $1/sif mkdir $1/analysis # this should actually be checked in and a symlink mkdir -p $1/analysis/templates mkdir $1/analysis/classifications mkdir $1/mas5 mkdir $1/ppla_output mkdir $1/ppla_output/100 mkdir $1/profiles mkdir $1/rand_file_lists mkdir $1/top_profiles mkdir $1/visualization mkdir $1/vgl_output # need to add sym link to dirs mkdir $1/vgl --- NEW FILE: test_row_length.pl --- use Data::Dumper; my $length = shift; $data = {}; while(<STDIN>) { my @tokens = split /\s+/; $data->{scalar(@tokens)}++; if (scalar(@tokens) != $length) { print scalar(@tokens)."\t".$_; } } print Dumper($data); --- NEW FILE: count_vgl_output_columns.pl --- while(<STDIN>) { chomp; my @tokens = split /\t/; $i=0; foreach $token (@tokens) { #if ($token =~ /DLDA_MTtoRest_DLDA_MT_vs_DLDA_WT/) { print "$token $i\n"; } if ($token =~ /selected/) { print "$token $i\n"; } $i++; } last; } --- NEW FILE: fix_testset_annotations.pl --- #!/usr/local/bin/perl use Storable; my ($file_list, $sif_storable) = @ARGV; my $data = retrieve($sif_storable); open INPUT, $file_list or die; while(<INPUT>) { chomp; print "$_\t"; if ($data->{files}{$_}{dlda_mt}) { print "dlda_mt\n"; } else { print "dlda_wt\n"; } } |
|
From: <boc...@su...> - 2006-02-22 09:15:07
|
Update of /cvsroot/dev-boconnor/project_logic_analysis In directory sumo.genetics.ucla.edu:/tmp/cvs-serv8351 Added Files: run_p53_breast_cancer_data_Miller_et_al_2005.sh run_t-cell_leukemia_data_Soulier_et_al_2005.sh Log Message: Many updates to the existing logic analysis libs and also a lot of new additions, particularly scripts --- NEW FILE: run_p53_breast_cancer_data_Miller_et_al_2005.sh --- #!/bin/sh perl -I./lib/perl -I/raid5a/boconnor/cvsroot/libnelson/lib -I/raid5a/boconnor/cvsroot/libnelson/lib/Nelson/Pipe/lib /raid5a/boconnor/cvsroot/libnelson/lib/Nelson/Pipe/runner.pl conf/p53_breast_cancer_data_Miller_et_al_2005.xml --- NEW FILE: run_t-cell_leukemia_data_Soulier_et_al_2005.sh --- #!/bin/sh perl -I./lib/perl -I/raid5a/boconnor/cvsroot/libnelson/lib -I/raid5a/boconnor/cvsroot/libnelson/lib/Nelson/Pipe/lib /raid5a/boconnor/cvsroot/libnelson/lib/Nelson/Pipe/runner.pl conf/t-cell_leukemia_data_Soulier_et_al_2005.xml |
|
From: <boc...@su...> - 2006-02-22 09:12:28
|
Update of /cvsroot/dev-boconnor/project_logic_analysis/conf In directory sumo.genetics.ucla.edu:/tmp/cvs-serv7915 Modified Files: make_profiles_and_run_la_include.xml original_glioma_classification_with_profiles.xml Added Files: classification_with_profiles.xml classification_with_vgl.xml p53_breast_cancer_data_Miller_et_al_2005.xml t-cell_leukemia_data_Soulier_et_al_2005.xml vgl_parsing_pipe.xml Log Message: Updates and additions to the conf files. I've tried to parameterize these as much as possible --- NEW FILE: t-cell_leukemia_data_Soulier_et_al_2005.xml --- <!-- Variables that are used throughout --> [% datadir = "t-cell_leukemia_data_Soulier_et_al_2005" %] [% cutoff_for_stability_percent = 10 %] [% dirs = ['90'] %] [% dir_str = '90' %] [% percent_to_hold_for_testset = 20 %] [% num_profiles = 2 %] [% total_number_profiles = 100 %] [% times_to_repeat = 100 %] [% profile_block_size = 10 %] [% ppla_block_size = 10 %] [% index = 0 %] <project project_name="Project_Logic_Analysis" project_description="This project looks at understanding gene relationships using the logic analysis technique created by P. Bowers and T. Yeates. I've extended the technique to use microarray data." db_uri="dbi:Pg:host=164.67.97.78;dbname=pipe" db_user="boconnor" db_password=""> <pipe pipe_name="Logic_Analysis_Network_Stability_With_T-Cell_Leukemia_Data_Pipe" pipe_desc="Tests the stability of networks using the t-cell leukemia dataset by Soulier et al." 
pipe_dir="/raid5a/boconnor/cvsroot/dev-boconnor/project_logic_analysis" testing_mode="1"> <settings> <plugin name="Logger" processor="Nelson::Pipe::Container::Plugin::Logger" log_to="db"/> <plugin name="SystemStateRecorder" processor="Nelson::Pipe::Container::Plugin::SystemStateRecorder"/> <plugin name="Versioner" processor="Nelson::Pipe::Container::Plugin::CVSVersioner" version_dir="/raid5a/boconnor/cvsroot/dev-boconnor/project_logic_analysis" tag_identifier="Logic_Analysis_Network_Stability_With_T-Cell_Leukemia_Data_Pipe"/> <plugin name="Publisher" processor="Nelson::Pipe::Container::Plugin::WebPublisher" publish_root_dir="/raid5a/boconnor/public_html/Projects" publish_url_prefix="http://sumo.genetics.ucla.edu/~boconnor/Projects"/> </settings> <initialization> <plugin name="Logger"/> <plugin name="SystemStateRecorder"/> <plugin name="Versioner"/> </initialization> <run> [%# this is an example of a comment %] <!-- Parses SIF, makes profiles, runs LA, and parses the result into a common data structure (which is detailed in my blog entry here: https://boconnor.is-a-geek.com/wiki/index.php?n=BoconnorResearchBlog.20051108 --> [% sif_file = "data/t-cell_leukemia_data_Soulier_et_al_2005/sif/t-cell_sif_for_vgl.txt" %] [% file_map_file = "NA" %] [% phenotypes = "NA" %] [% col_ordering = "NA" %] [% sif_format = "simple" %] [% random_selection_technique = "across_all_samples" %] [% parse_old_mas5 = 0 %] [% profiles_to_count = "TAL_R,HOX_R" %] [% compare_to_reference = "0" %] [% INCLUDE make_profiles_and_run_la_include.xml %] <!-- Code that parses the Voting Gene List output --> [% index = index + 100 %] [% base_col = 81 %] <!-- this is the column that starts a result, it just has a column header and nothing in the rows --> [% input_file_name = "dChipExpr_Leukemia_groupTtest.xls" %] [% pheno_str_1 = "TAL_R" %] [% pheno_str_2 = "HOX_R" %] [%# INCLUDE vgl_parsing_pipe.xml %] <!-- Now take the logic analysis information and extract the top X profiles present in Y% or more of 
the experiments and 1) produce a sorted HTML output for it that can be browsed and 2) build up networks for the same set of profiles and graph them out with graphviz. --> <!-- FIXME: It doesn't always find the profile in the reference set. Need to fix this!!! --> <!-- FIXME: I thought I fixed the not finding profile in ref set problem but still happens in 50% set --> <!-- FIXME: need to find all the profiles otherwise there won't be much to classify with --> [% index = index + 100 %] <!-- FIXME: scoping issues with this variable!! --> <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::PickTopProfiles"> <input> <item id="parsed_output_stashname" value="summary_of_ppla_output"/> <item id="parsed_output_filename" value="data/[% datadir %]/ppla_output/parsed_output.storable"/> <item id="profile_sub_dirs" value="[% dir_str %]"/> <item id="cutoff" value="[% cutoff_for_stability_percent %]"/> <item id="complete_ppla_output" value="data/[% datadir %]/ppla_output/100/file_list_0.output"/> <!-- Not used --> </input> <output> <item id="output_dir" value="data/[% datadir %]/top_profiles"/> </output> </step> <!-- Morgan's program (which I heavily modified) to create an HTML document to display the profiles of interest --> <!-- perl visualiseTriplets.pl ../glioma_data/sorted_profiles_both_annotated.out ../glioma_data/profiles.txt -eprofile_results_complete_annotations.storable > ~/public_html/project_logic_analysis/glioma_profiles/brain_profiles_logic_type_both_annotated.html --> [% index = index + 1 %] [% FOREACH dir = dirs %] <!-- FIXME: does this sort the profiles? I don't do that elsewhere. 
Maybe I should in PickTopProfiles --> <step id="[% index %]" active="0" type="shell_command" processor="perl scripts/visualiseTriplets.pl"> <processor_args> <arg id="1" name="" value="data/[% datadir %]/top_profiles/[% dir %]/top_[% cutoff_for_stability_percent %]_percent.profiles"/> <arg id="2" name="" value="data/[% datadir %]/profiles/100/file_list_0.profile"/> <arg id="3" name="" value="-edata/annotations/profile_results_complete_annotations.storable"/> <arg id="4" name="" value="> data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.html"/> </processor_args> <output> <item id="output_file" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.html" publish="0"/> </output> </step> [% index = index + 1 %] [% END %] <!-- This section calls Peter's code to calculate p-values based on hypergeometric dist. It relies on the output of ReadProfileOutput and PickTopProfiles. It's a hack on the visualizer to create an input to Peter's hypergeometric calculation. --> [% index = index + 1 %] [% FOREACH dir = dirs %] <!-- FIXME: does this sort the profiles? I don't do that elsewhere. 
Maybe I should in PickTopProfiles --> <step id="[% index %]" active="0" type="shell_command" processor="perl scripts/exportTriplets.pl"> <processor_args> <arg id="1" name="" value="data/[% datadir %]/top_profiles/[% dir %]/top_[% cutoff_for_stability_percent %]_percent.profiles"/> <arg id="2" name="" value="data/[% datadir %]/profiles/100/file_list_0.profile"/> <arg id="3" name="" value="-edata/annotations/profile_results_complete_annotations.storable"/> <arg id="4" name="" value="> data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt"/> </processor_args> <output> <item id="output_file" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt" publish="0"/> </output> </step> [% index = index + 1 %] <step id="[% index %]" active="0" type="shell_command" processor="perl scripts/hypergeometric.pl"> <processor_args> <arg id="1" name="" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt"/> <arg id="2" name="" value="> data/[% datadir %]/hypergeometric/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent_hyper_probs.txt"/> </processor_args> </step> <!-- This script just takes the output from exportTriplets and adds some additional information (stability score and p-value) It also reads in the storable object (profile_data) which contains a ton of parsed data and creates a new "p-value" entry that stores the various p-value calculations done by Peter. This is used by FindMostConnectedNodes and visualiseTriplets.pl to annotate the results. This script writes the frequency and p-values back to data/[% datadir %]/ppla_output/parsed_output.storable --> <!-- FIXME: I should look for ways to consolidate the writing of p-values and freq. 
back to parsed_output.storable --> <!-- FIXME: the information contained in the output of this script is really useful and I should (somehow) add it to the visualiseTriplet.pl output.--> [% index = index + 1 %] <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::AppendPValuesToExportOutput"> <input> <item id="profile_output" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt"/> <item id="hypergeometric_output" value="data/[% datadir %]/hypergeometric/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent_hyper_probs.txt"/> <item id="profile_data" value="data/[% datadir %]/ppla_output/parsed_output.storable"/> <item id="subdir" value="[% dir %]"/> </input> <output> <item id="output_file" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent_w_pvalues.txt"/> <item id="output_storable" value="data/[% datadir %]/ppla_output/parsed_output.storable"/> </output> </step> [% index = index + 1 %] [% END %] <!-- now parse out the top profiles and collect some statistics on them --> <!-- FIXME: this is redundant with what's below! --> [% index = index + 1 %] <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAOutputProfiles"> <input> <item id="ppla_output_profiles_dir" value="data/[% datadir %]/top_profiles"/> <item id="subdirs" value="[% dir_str %]"/> <item id="filename" value="top_[% cutoff_for_stability_percent %]_percent.profiles"/> </input> <output> <item id="stash_output" value="stable_ppla_output_profiles"/> <item id="stash_output_file" value="data/[% datadir %]/top_profiles/parsed_ppla_output.storable"/> </output> </step> <!-- This step goes through the PPLA output parsed above and counts how many times a given probeset is included in a triplet relationship. 
It then summarizes this information into a hash and hands off the display of the information to a tt2. The output is an HTML document that lists the most connected genes and links to the output to visualiseTriplets.pl for each subset. This script is responsible for calling visualiseTriplets.pl on the subset of the triplets in question to visualize the individual networks with html and png output. --> <!-- FIXME: this step should be followed with other network-based analysis on the logic relationships --> <!-- FIXME: subdir is currently hardcoded inside this script!! --> <!-- FIXME: remove the calls to other programs/scripts and move this fxn into another module --> [% index = index + 1 %] [% FOREACH dir = dirs %] <!-- FIXME: failing with "undef error - Can't use string ("") as a HASH ref while "strict refs" in use at lib/perl/Nelson/Pipe/Container/Job/FindMostConnectedNodes.pm line 383" --> <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::FindMostConnectedNodes"> <input> <item id="stash_input" value="stable_ppla_output_profiles"/> <item id="ppla_output_profiles_dir" value="data/[% datadir %]/top_profiles"/> <item id="subdir" value="[% dir%]"/> <item id="extra_info" value="data/annotations/profile_results_complete_annotations.storable"/> <item id="filename" value="top_[% cutoff_for_stability_percent %]_percent.profiles"/> <item id="pvalues" value="data/[% datadir %]/ppla_output/parsed_output.storable"/> <item id="AppendPValuesToExportOutput_output" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent_w_pvalues.txt"/> <item id="min_triplets" value="3"/> <item id="template_most_connected" value="index_for_connected_nodes.tt2"/> <item id="template_lowest_p_value" value="index_for_connected_nodes.tt2"/> <item id="template_most_stable" value="index_for_connected_nodes.tt2"/> <item id="template_detailed" value="details_for_connected_nodes.tt2"/> <item id="template_dir" 
value="data/[% datadir %]/analysis/templates"/> <item id="profiles_to_count" value="[% profiles_to_count %]"/> </input> <output> <item id="output_dir" value="data/[% datadir %]/visualization/90/top_[% cutoff_for_stability_percent %]_percent_stable/breakdown_3_or_more"/> </output> </step> [% index = index + 1 %] [% END %] <!-- Just creates a summary page at http://humerus/project_logic_analysis --> [% index = index + 1 %] <step id="[% index %]" active="0" type="shell_command" processor="./scripts/wiki2html.pl"> <processor_args> <arg id="1" name="" value="data/[% datadir %]/visualization/introduction.txt"/> <arg id="2" name="" value="> /raid5a/boconnor/public_html/Projects/Project_Logic_Analysis/Logic_Analysis_Network_Stability_With_Original_Brain_Tumor_Data_Pipe/index.html"/> </processor_args> </step> <!-- look for a bias in the oncogene/tumor suppressor annotations. This is less flexible than the generic annotation bias checker below --> <!-- FIXME: includes some hardcoded elements --> [%# INCLUDE oncogene_counts.xml %] <!-- This series of scripts takes an input list of "interesting" probesets and build a network of what they connect with It annotates those probesets using the Affy array information file and then colors the nodes as green if a "!" probeset and red if it's expressed. FIXME: this assumption only works if the network is built with one phenotype at a time! The nodes then link back to the summary HTML descriptions and the edges link to records within the HTML description files making it easy to see the actual relationship, binary profiles, and additional free text annotations associated with each probeset. The output is directed to a visualization directory. 
--> [%# INCLUDE build_interesting_networks.xml %] <!-- A very simple series of scripts that 1) pull out a non-redundant list of probesets in the most stable logic triplets, 2) counts the number of probesets whose OMIM records contain one or more of a collection of keyword terms, 3) compare this to X number of random trials where the same number of probesets are selected randomly and the annotation bias is checked, and finally 4) treat these numbers of matching probesets for each trial as a normal random variable and compute a two-tailed p-value for the number of annotations on the original list of probesets. --> [%# INCLUDE compare_annotation_bias.xml %] <!-- # identify list of probesets of interest # for each probeset, identify genomic location via chado # extract upstream region of 2Kb # scan 2Kb region for known binding sites # identify factors binding these sites # repeat whole process X times with random lists of probesets and evaluate significance of results (maybe I can work with Barry on a statistical technique that doesn't require random sampling) --> [%# INCLUDE search_for_tf_binding_sites.xml %] <!-- Performs classification based on profiles (triplet relationships). Take Z number of non-training set data and run it through a prediction process where the microarray data is converted to [1|0] and each profile is assessed. If it's valid then the score gets a 1 otherwise -1 and normalize on the number of profiles used for that HC. At the end there should be a score for each HC for a given sample, assign it to the HC with the highest score. --> [% profiles_to_count = "TAL_R,HOX_R" %] <!-- FIXME: this is redefined from above --> [% test_set_annotations_file = "test_set" %] [% index = index + 100 %] [% INCLUDE classification_with_profiles.xml %] <!-- Perform the classification based on the vgl from Marc --> <!-- All the inputs need to be defined here! 
--> [% index = index + 100 %] [% testset_w_annotations = "test_set_90_w_annotations.txt" %] [% exp_values = "data/t-cell_leukemia_data_Soulier_et_al_2005/dChipExpr_Leukemia.xls" %] [%# INCLUDE classification_with_vgl.xml %] <!-- LEFT OFF HERE --> <!-- Collect some statistics on stability and U score --> [%# INCLUDE original_glioma_statistics_on_stability.xml %] </run> <cleanup> <plugin name="Publisher"/> </cleanup> </pipe> </project> --- NEW FILE: classification_with_profiles.xml --- <!-- The next steps will read the PPLA input from CreateProfiles, read the top X number of profiles from PickTopProfiles, and use these profiles and expression data to, for each sample in a list of samples, score each sample +1 if it matches a profile and -1 if it doesn't match a profile. The output is a hash where the key is the sample name and the values are each phenotype (ie HC) and it's corresponding score normalized by the number of profiles used for that score and ranging between 1 and -1. Finally the results will be summarized as correct or not and the overall predictive process will be scored. 
--> <!-- FIXME: all PPLA input files must contain /^sample/ on the first row --> [% index = index + 1 %] <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAInputFile"> <input> <item id="ppla_input_file" value="data/[% datadir %]/profiles/100/file_list_0.profile"/> </input> <output> <item id="stash_output" value="complete_ppla_input"/> </output> </step> [% index = index + 1 %] <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAOutputProfiles"> <input> <item id="ppla_output_profiles_dir" value="data/[% datadir %]/top_profiles"/> <item id="subdirs" value="[% dir_str %]"/> <item id="filename" value="top_[% cutoff_for_stability_percent %]_percent.profiles"/> </input> <output> <item id="stash_output" value="stable_ppla_output_profiles"/> </output> </step> <!-- FIXME: did I code all the logic types correctly? Also, this module contains hardcoded phenotypes. --> [% FOREACH dir = dirs %] [% index = index + 1 %] <!-- Normally, RandomlySelectFiles should produce this output that includes the filename\tannotation. The problem is what annotation to use? In the brain tumor data it was simple: HC_1A...etc. Here I want to use dlda phenotype but I modified RandomlySelectFiles to not output the phenotype (since because of optimization it isn't stored) I wrote a quick script below to append the annotation onto the file. It should only be used when data is read for non-glioma datasets. This uses the data from SifFileParser to find the annotations. --> <!-- FIXME: THIS STEP IS A HACK!!!! 
--> <step id="[% index %]" active="0" type="shell_command" processor="./scripts/fix_testset_t-cell_annotations.pl"> <processor_args> <arg id="1" name="" value="data/[% datadir %]/rand_file_lists/[% test_set_annotations_file %]_[% dir %].txt"/> <arg id="2" name="" value="data/[% datadir %]/sif_hash.storable"/> <arg id="3" name="" value="> data/[% datadir %]/rand_file_lists/[% test_set_annotations_file %]_[% dir %]_w_annotations.txt"/> </processor_args> </step> [% index = index + 1 %] <!-- FIXME: this needs to loop and only use the top 10,20,30,40...100 top profiles for each HC --> <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::ScoreSamplesViaProfiles"> <input> <!-- FIXME: these phenotypes are hard coded, may need to change with different datasets --> <!-- FIXME: I could run this module a second time using an expanded test set with the 111 glioma samples, filtering out anything used in training. --> <item id="list_of_files" value="data/[% datadir %]/rand_file_lists/[% test_set_annotations_file %]_[% dir %]_w_annotations.txt"/> <item id="list_of_phenotypes" value="[% profiles_to_count %]"/> <item id="ppla_input_stash" value="complete_ppla_input"/> <item id="profiles_stash" value="stable_ppla_output_profiles"/> <item id="subdir" value="[% dir %]"/> <item id="total_profiles_to_use" value="[% num %]"/> <item id="profile_count_cutoffs" value="1,2,3,4,5,10,15,20,30,40,50,60,70,80,90,100,all"/> </input> <output> <item id="stash_output" value="scores_for_samples"/> <item id="output_summary_file" value="data/[% datadir %]/analysis/classifications/[% dir %]_profile_based_classification_summary"/> </output> </step> [% index = index + 1 %] <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::SummarizeClassification"> <input> <item id="stash_input" value="scores_for_samples"/> <item id="subdir" value="[% dir %]"/> <!-- FIXME: this template includes a hard-coded dimension!! 
--> <item id="sample_number" value="16"/> <item id="R_template" value="data/[% datadir %]/analysis/templates/profile_count_vs_correct_percentage.R.tt2"/> </input> <output> <item id="R_file" value="data/[% datadir %]/analysis/results/[% dir %]_profile_count_vs_correct_percentage_profiles.R"/> <item id="png_file_dir" value="data/[% datadir %]/analysis/results"/> </output> </step> [% index = index + 1 %] <step id="[% index %]" active="0" type="shell_command" processor="R"> <processor_args> <arg id="1" name="" value="--vanilla"/> <arg id="2" name="" value="< data/[% datadir %]/analysis/results/[% dir %]_profile_count_vs_correct_percentage_profiles.R"/> </processor_args> </step> [% index = index + 1 %] [% END %] <!-- THE SAME BUT RANDOMIZED! --> <!-- The next steps will read the PPLA input from CreateProfiles, read the top X number of profiles from PickTopProfiles, and use these profiles and expression data to, for each sample in a list of samples, score each sample +1 if it matches a profile and -1 if it doesn't match a profile. The output is a hash where the key is the sample name and the values are each phenotype (ie HC) and it's corresponding score normalized by the number of profiles used for that score and ranging between 1 and -1. Finally the results will be summarized as correct or not and the overall predictive process will be scored. --> <!-- FIXME: all PPLA input files must contain /^sample/ on the first row --> [% index = index + 1 %] <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAInputFile"> <input> <item id="ppla_input_file" value="data/[% datadir %]/profiles/100/file_list_0.profile"/> <item id="randomized" value="1"/> </input> <output> <item id="stash_output" value="complete_ppla_input_randomized"/> </output> </step> [% index = index + 1 %] <!-- FIXME: did I code all the logic types correctly? Also, this module contains hardcoded phenotypes. 
--> [% FOREACH dir = dirs %] [% index = index + 1 %] <!-- Normally, RandomlySelectFiles should produce this output that includes the filename\tannotation. The problem is what annotation to use? In the brain tumor data it was simple: HC_1A...etc. Here I want to use dlda phenotype but I modified RandomlySelectFiles to not output the phenotype (since because of optimization it isn't stored) I wrote a quick script below to append the annotation onto the file. It should only be used when data is read for non-glioma datasets. This uses the data from SifFileParser to find the annotations. --> <!-- FIXME: this needs to loop and only use the top 10,20,30,40...100 top profiles for each HC --> <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::ScoreSamplesViaProfiles"> <input> <!-- FIXME: these phenotypes are hard coded, may need to change with different datasets --> <!-- FIXME: I could run this module a second time using an expanded test set with the 111 glioma samples, filtering out anything used in training. 
--> <item id="list_of_files" value="data/[% datadir %]/rand_file_lists/[% test_set_annotations_file %]_[% dir %]_w_annotations.txt"/> <item id="list_of_phenotypes" value="[% profiles_to_count %]"/> <item id="ppla_input_stash" value="complete_ppla_input_randomized"/> <item id="profiles_stash" value="stable_ppla_output_profiles"/> <item id="subdir" value="[% dir %]"/> <item id="total_profiles_to_use" value="[% num %]"/> <item id="profile_count_cutoffs" value="1,2,3,4,5,10,15,20,30,40,50,60,70,80,90,100,all"/> </input> <output> <item id="stash_output" value="scores_for_samples_randomized"/> <item id="output_summary_file" value="data/[% datadir %]/analysis/classifications/[% dir %]_profile_based_classification_randomized_summary"/> </output> </step> [% index = index + 1 %] <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::SummarizeClassification"> <input> <item id="stash_input" value="scores_for_samples_randomized"/> <item id="subdir" value="[% dir %]"/> <!-- FIXME: this template includes a hard-coded dimension!! 
--> <item id="sample_number" value="16"/> <item id="R_template" value="data/[% datadir %]/analysis/templates/profile_count_vs_correct_percentage.R.tt2"/> </input> <output> <item id="R_file" value="data/[% datadir %]/analysis/results/[% dir %]_profile_count_vs_correct_percentage_profiles_randomized.R"/> <item id="png_file_dir" value="data/[% datadir %]/analysis/results"/> </output> </step> [% index = index + 1 %] <step id="[% index %]" active="1" type="shell_command" processor="R"> <processor_args> <arg id="1" name="" value="--vanilla"/> <arg id="2" name="" value="< data/[% datadir %]/analysis/results/[% dir %]_profile_count_vs_correct_percentage_profiles_randomized.R"/> </processor_args> </step> [% index = index + 1 %] [% END %] Index: make_profiles_and_run_la_include.xml =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/conf/make_profiles_and_run_la_include.xml,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** make_profiles_and_run_la_include.xml 17 Feb 2006 00:09:07 -0000 1.5 --- make_profiles_and_run_la_include.xml 22 Feb 2006 09:12:23 -0000 1.6 *************** *** 2,6 **** [% index = index + 1 %] <!-- parses the SIF file to generate a hash of file names and their HC --> ! <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::SifFileParser"> <input> <item id="sif_file" value="[% sif_file %]"/> --- 2,6 ---- [% index = index + 1 %] <!-- parses the SIF file to generate a hash of file names and their HC --> ! 
<step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::SifFileParser"> <input> <item id="sif_file" value="[% sif_file %]"/> *************** *** 51,54 **** --- 51,55 ---- <item id="end" value="[% j+profile_block_size %]"/> <item id="pre_cache_mas5" value="1"/> + <item id="no_overwrite" value="1"/> </input> <output> *************** *** 58,63 **** </step> [% index = index + 1 %] - [% j = j+profile_block_size %] [% END %] [% END %] --- 59,64 ---- </step> [% index = index + 1 %] [% END %] + [% j = j+profile_block_size %] [% END %] *************** *** 65,68 **** --- 66,70 ---- [% index = index + 1 %] <!-- the next two steps just read all the profiles --> + <!-- FIXME: LEFT OFF HERE, this dataset should include none of the testing samples!! --> <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::RandomlySelectFiles"> <input> *************** *** 102,110 **** [% WHILE j < total_number_profiles %] <!-- execution_type="cluster" --> ! <step id="[% index %].[% j %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::PPLARunner" execution_type="cluster"> <input> <item id="entropy_filter" value="3"/> ! <item id="individual_u_max" value="0.4"/> ! <item id="together_u_min" value="0.6"/> <item id="number_profiles" value="[% num_profiles %]"/> <item id="lowA" value="-1"/> --- 104,112 ---- [% WHILE j < total_number_profiles %] <!-- execution_type="cluster" --> ! <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::PPLARunner"> <input> <item id="entropy_filter" value="3"/> ! <item id="individual_u_max" value="0.3"/> ! 
<item id="together_u_min" value="0.5"/> <item id="number_profiles" value="[% num_profiles %]"/> <item id="lowA" value="-1"/> *************** *** 115,118 **** --- 117,121 ---- <item id="start" value="[% j %]"/> <item id="end" value="[% j+profile_block_size %]"/> + <item id="no_overwrite" value="1"/> <!-- item id="profiles_sub_dirs" value="75,90"/ --> </input> *************** *** 122,128 **** </step> [% j = j+ppla_block_size %] - [% END %] [% index = index + 1 %] [% END %] [% index = index + 1 %] --- 125,153 ---- </step> [% j = j+ppla_block_size %] [% index = index + 1 %] [% END %] + [% END %] + + [% index = index + 1 %] + <!-- This runs too slow! --> + <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::PPLARunner"> + <input> + <item id="entropy_filter" value="3"/> + <item id="individual_u_max" value="0.3"/> + <item id="together_u_min" value="0.5"/> + <item id="number_profiles" value="[% num_profiles %]"/> + <item id="lowA" value="-1"/> + <item id="highA" value="-1"/> + <item id="ppla_bin" value="bin/PPLA-1.1-255"/> + <item id="profiles_dir" value="data/[% datadir %]/profiles"/> + <item id="profiles_sub_dirs" value="100"/> + <item id="start" value="0"/> + <item id="end" value="2"/> + </input> + <output> + <item id="output_dir" value="data/[% datadir %]/ppla_output"/> + </output> + </step> + [% index = index + 1 %] *************** *** 135,144 **** https://boconnor.is-a-geek.com/wiki/index.php?n=BoconnorResearchBlog.20051108 --> ! <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::ReadProfileOutput"> <input> <item id="profile_output_dir" value="data/[% datadir %]/ppla_output"/> <item id="profile_output_sub_dirs" value="[% dir_str %]"/> <item id="reference_profile" value="data/[% datadir %]/ppla_output/100/file_list_0.output"/> ! 
<item id="profiles_to_count" value="HC_1A,HC_1B,HC_2A,HC_2B,grade_3,grade_4,sex_f,sex_m,survial_time_group_36,survial_time_group_37,survial_time_group_38,survial_time_group_43,survial_time_group_53,survial_time_group_54,survial_time_group_57,survial_time_group_31,survial_time_group_32,tumor_type_astro,tumor_type_gbm,tumor_type_mixed,tumor_type_oligo,survival_cluster_1,survival_cluster_2"/> </input> <output> --- 160,170 ---- https://boconnor.is-a-geek.com/wiki/index.php?n=BoconnorResearchBlog.20051108 --> ! <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::ReadProfileOutput"> <input> <item id="profile_output_dir" value="data/[% datadir %]/ppla_output"/> <item id="profile_output_sub_dirs" value="[% dir_str %]"/> <item id="reference_profile" value="data/[% datadir %]/ppla_output/100/file_list_0.output"/> ! <item id="profiles_to_count" value="[% profiles_to_count %]"/> ! <item id="compare_to_reference" value="[% compare_to_100_percent_reference %]"/> </input> <output> Index: original_glioma_classification_with_profiles.xml =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/conf/original_glioma_classification_with_profiles.xml,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** original_glioma_classification_with_profiles.xml 18 Jan 2006 01:15:26 -0000 1.2 --- original_glioma_classification_with_profiles.xml 22 Feb 2006 09:12:23 -0000 1.3 *************** *** 8,12 **** --> <!-- FIXME: all PPLA input files must contain /^sample/ on the first row --> ! <step id="43.1" active="0" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAInputFile"> <input> <item id="ppla_input_file" value="data/[% datadir %]/profiles/100/file_list_0.profile"/> --- 8,13 ---- --> <!-- FIXME: all PPLA input files must contain /^sample/ on the first row --> ! [% index = index + 1 %] ! 
<step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAInputFile"> <input> <item id="ppla_input_file" value="data/[% datadir %]/profiles/100/file_list_0.profile"/> *************** *** 16,20 **** </output> </step> ! <step id="43.2" active="0" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAOutputProfiles"> <input> <item id="ppla_output_profiles_dir" value="data/[% datadir %]/top_profiles"/> --- 17,22 ---- </output> </step> ! [% index = index + 1 %] ! <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAOutputProfiles"> <input> <item id="ppla_output_profiles_dir" value="data/[% datadir %]/top_profiles"/> *************** *** 27,44 **** </step> <!-- FIXME: did I code all the logic types correctly? Also, this module contains hardcoded phenotypes. --> - [% i = 1 %] [% FOREACH dir = dirs %] <!-- FIXME: this needs to loop and only use the top 10,20,30,40...100 top profiles for each HC --> ! <step id="44.1[% i %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::ScoreSamplesViaProfiles"> <input> <!-- FIXME: these phenotypes are hard coded, may need to change with different datasets --> <!-- FIXME: I could run this module a second time using an expanded test set with the 111 glioma samples, filtering out anything used in training. --> ! <item id="list_of_files" value="data/[% datadir %]/rand_file_lists/test_set.txt"/> ! <item id="list_of_phenotypes" value="HC_1A,HC_2A,HC_1B,HC_2B"/> <item id="ppla_input_stash" value="complete_ppla_input"/> <item id="profiles_stash" value="stable_ppla_output_profiles"/> <item id="subdir" value="[% dir %]"/> <item id="total_profiles_to_use" value="[% num %]"/> ! <item id="profile_count_cutoffs" value="5,10,15,20,30,40,50,60,70,80,90,100,all"/> </input> <output> --- 29,62 ---- </step> <!-- FIXME: did I code all the logic types correctly? Also, this module contains hardcoded phenotypes. 
--> [% FOREACH dir = dirs %] + [% index = index + 1 %] + <!-- Normally, RandomlySelectFiles should produce this output that includes the filename\tannotation. The problem is + what annotation to use? In the brain tumor data it was simple: HC_1A...etc. Here I want to use dlda phenotype + but I modified RandomlySelectFiles to not output the phenotype (since because of optimization it isn't stored) + I wrote a quick script below to append the annotation onto the file. It should only be used when data is read + for non-glioma datasets. + This uses the data from SifFileParser to find the annotations. + --> + <!-- FIXME: THIS STEP IS A HACK!!!! --> + <step id="[% index %]" active="1" type="shell_command" processor="./scripts/fix_testset_t-cell_annotations.pl"> + <processor_args> + <arg id="1" name="" value="data/[% datadir %]/rand_file_lists/[% test_set_annotations_file %]_[% dir %].txt"/> + <arg id="2" name="" value="data/[% datadir %]/sif_hash.storable"/> + <arg id="3" name="" value="> data/[% datadir %]/rand_file_lists/[% test_set_annotations_file %]_[% dir %]_w_annotations.txt"/> + </processor_args> + </step> + [% index = index + 1 %] <!-- FIXME: this needs to loop and only use the top 10,20,30,40...100 top profiles for each HC --> ! <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::ScoreSamplesViaProfiles"> <input> <!-- FIXME: these phenotypes are hard coded, may need to change with different datasets --> <!-- FIXME: I could run this module a second time using an expanded test set with the 111 glioma samples, filtering out anything used in training. --> ! <item id="list_of_files" value="data/[% datadir %]/rand_file_lists/[% test_set_annotations_file %]_[% dir %]_w_annotations.txt"/> ! 
<item id="list_of_phenotypes" value="[% profiles_to_count %]"/> <item id="ppla_input_stash" value="complete_ppla_input"/> <item id="profiles_stash" value="stable_ppla_output_profiles"/> <item id="subdir" value="[% dir %]"/> <item id="total_profiles_to_use" value="[% num %]"/> ! <item id="profile_count_cutoffs" value="1,2,3,4,5,10,15,20,30,40,50,60,70,80,90,100,all"/> </input> <output> *************** *** 47,70 **** </output> </step> ! <step id="44.2[% i %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::SummarizeClassification"> <input> <item id="stash_input" value="scores_for_samples"/> <item id="subdir" value="[% dir %]"/> <!-- FIXME: this template includes a hard-coded dimension!! --> ! <item id="sample_number" value="12"/> <item id="R_template" value="data/[% datadir %]/analysis/templates/profile_count_vs_correct_percentage.R.tt2"/> </input> <output> ! <item id="R_file" value="data/[% datadir %]/analysis/results/[% dir %]_profile_count_vs_correct_percentage.R"/> <item id="png_file_dir" value="data/[% datadir %]/analysis/results"/> </output> </step> ! <step id="44.3[% i %]" active="0" type="shell_command" processor="R"> <processor_args> <arg id="1" name="" value="--vanilla"/> ! <arg id="2" name="" value="< data/[% datadir %]/analysis/results/[% dir %]_profile_count_vs_correct_percentage.R"/> </processor_args> </step> ! [% i = i+1 %] [% END %] --- 65,90 ---- </output> </step> ! [% index = index + 1 %] ! <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::SummarizeClassification"> <input> <item id="stash_input" value="scores_for_samples"/> <item id="subdir" value="[% dir %]"/> <!-- FIXME: this template includes a hard-coded dimension!! --> ! <item id="sample_number" value="16"/> <item id="R_template" value="data/[% datadir %]/analysis/templates/profile_count_vs_correct_percentage.R.tt2"/> </input> <output> ! 
<item id="R_file" value="data/[% datadir %]/analysis/results/[% dir %]_profile_count_vs_correct_percentage_profiles.R"/> <item id="png_file_dir" value="data/[% datadir %]/analysis/results"/> </output> </step> ! [% index = index + 1 %] ! <step id="[% index %]" active="1" type="shell_command" processor="R"> <processor_args> <arg id="1" name="" value="--vanilla"/> ! <arg id="2" name="" value="< data/[% datadir %]/analysis/results/[% dir %]_profile_count_vs_correct_percentage_profiles.R"/> </processor_args> </step> ! [% index = index + 1 %] [% END %] --- NEW FILE: classification_with_vgl.xml --- <!-- Now repeat the whole process, this time use the VGL to classify samples --> <!-- This step uses the output from ReadVGLOutput --> <!-- FIXME: note hardcoded subdir here --> <!-- FIXME: the next three steps don't seem to work. Somewhere the categories seems to be crossed!? --> [% index = index + 1 %] [% FOREACH dir = dirs %] <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::ScoreSamplesViaVGL"> <input> <item id="list_of_files" value="data/[% datadir %]/rand_file_lists/[% testset_w_annotations %]"/> <item id="list_of_phenotypes" value="[% profiles_to_count %]"/> <item id="vgl_input" value="data/[% datadir %]/vgl_output/parsed_output.storable"/> <item id="subdir" value="[% dir %]"/> <!-- item id="profile_count_cutoffs" value="5"/ --> <item id="profile_count_cutoffs" value="1,2,3,4,5,10,15,20,30,40,50,60,70,80,90,100,all"/> <item id="exp_values" value="[% exp_values %]"/> </input> <output> <item id="stash_output" value="vgl_scores_for_samples"/> <item id="output_summary_file" value="data/[% datadir %]/analysis/classifications/[% dir %]_percent_vgl_based_classification_summary"/> </output> </step> [% index = index + 1 %] <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::SummarizeClassification"> <input> <item id="stash_input" value="vgl_scores_for_samples"/> <item id="subdir" value="[% dir %]"/> <item 
id="sample_number" value="16"/> <!-- this is the number of profile_count_cutoffs (w/o 'all') --> <!-- FIXME: this template includes a hard-coded dimension!! --> <item id="R_template" value="data/[% datadir %]/analysis/templates/profile_count_vs_correct_percentage.R.tt2"/> </input> <output> <item id="R_file" value="data/[% datadir %]/analysis/results/[% dir %]_profile_count_vs_correct_percentage_vgl.R"/> <item id="png_file_dir" value="data/[% datadir %]/analysis/results"/> </output> </step> [% index = index + 1 %] <step id="[% index %]" active="1" type="shell_command" processor="R"> <processor_args> <arg id="1" name="" value="--vanilla"/> <arg id="2" name="" value="< data/[% datadir %]/analysis/results/[% dir %]_profile_count_vs_correct_percentage_vgl.R"/> </processor_args> </step> [% index = index + 1 %] [% END %] --- NEW FILE: p53_breast_cancer_data_Miller_et_al_2005.xml --- <!-- Variables that are used throughout --> [% datadir = "p53_breast_cancer_data_Miller_et_al_2005" %] [% cutoff_for_stability_percent = 10 %] [% dirs = ['75'] %] [% dir_str = '75' %] [% percent_to_hold_for_testset = 35 %] [% num_profiles = 14 %] [% total_number_profiles = 100 %] [% times_to_repeat = 100 %] [% profile_block_size = 10 %] [% ppla_block_size = 10 %] [% index = 0 %] <project project_name="Project_Logic_Analysis" project_description="This project looks at understanding gene relationships using the logic analysis technique created by P. Bowers and T. Yeates. I've extended the technique to use microarray data." db_uri="dbi:Pg:host=164.67.97.78;dbname=pipe" db_user="boconnor" db_password=""> <pipe pipe_name="Logic_Analysis_Network_Stability_With_p53_Breast_Cancer_Data_Miller_et_al_Pipe" pipe_desc="Tests the stability of networks using the p53 breast cancer dataset by Miller et al." 
pipe_dir="/raid5a/boconnor/cvsroot/dev-boconnor/project_logic_analysis" testing_mode="1"> <settings> <plugin name="Logger" processor="Nelson::Pipe::Container::Plugin::Logger" log_to="db"/> <plugin name="SystemStateRecorder" processor="Nelson::Pipe::Container::Plugin::SystemStateRecorder"/> <plugin name="Versioner" processor="Nelson::Pipe::Container::Plugin::CVSVersioner" version_dir="/raid5a/boconnor/cvsroot/dev-boconnor/project_logic_analysis" tag_identifier="Logic_Analysis_Network_Stability_With_T-Cell_Leukemia_Data_Pipe"/> <plugin name="Publisher" processor="Nelson::Pipe::Container::Plugin::WebPublisher" publish_root_dir="/raid5a/boconnor/public_html/Projects" publish_url_prefix="http://sumo.genetics.ucla.edu/~boconnor/Projects"/> </settings> <initialization> <plugin name="Logger"/> <plugin name="SystemStateRecorder"/> <plugin name="Versioner"/> </initialization> <run> [%# this is an example of a comment %] <!-- Parses SIF, makes profiles, runs LA, and parses the result into a common data structure (which is detailed in my blog entry here: https://boconnor.is-a-geek.com/wiki/index.php?n=BoconnorResearchBlog.20051108 --> [% sif_file = "data/p53_breast_cancer_data_Miller_et_al_2005/sif/p53_sif.txt" %] [% file_map_file = "data/p53_breast_cancer_data_Miller_et_al_2005/sif/p53_map.txt" %] [% phenotypes = "NA" %] [% col_ordering = "NA" %] [% sif_format = "geo" %] [% random_selection_technique = "across_all_samples" %] [% parse_old_mas5 = 0 %] [% profiles_to_count = "grade_1,grade_2,grade_3,lymph_pos,lymph_neg,er_pos,er_neg,pgr_pos,pgr_neg,dlda_wt,dlda_mt,p53_wt,p53_mt" %] [% compare_to_reference = "0" %] [% INCLUDE make_profiles_and_run_la_include.xml %] <!-- Code that parses the Voting Gene List output --> <!-- LEFT OFF HERE --> <!-- The ReadVGLOutput module needs to be reworked to remove reference to 100% dataset and also to parse output correctly --> [% index = index + 100 %] [% base_col = 104 %] [% input_file_name = "dChipExpr_BreastCancer_groupTtest.xls" %] [% 
pheno_str_1 = "DLDA_WT" %] [% pheno_str_2 = "DLDA_MT" %] [%# INCLUDE vgl_parsing_pipe.xml %] <!-- Now take the logic analysis information and extract the top X profiles present in Y% or more of the experiments and 1) produce a sorted HTML output for it that can be browsed and 2) build up networks for the same set of profiles and graph them out with graphviz. --> <!-- FIXME: It doesn't always find the profile in the reference set. Need to fix this!!! --> <!-- FIXME: I thought I fixed the not finding profile in ref set problem but still happens in 50% set --> <!-- FIXME: need to find all the profiles otherwise there won't be much to classify with --> [% index = index + 29 %] <!-- FIXME: scoping issues with this variable!! --> <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::PickTopProfiles"> <input> <item id="parsed_output_stashname" value="summary_of_ppla_output"/> <item id="parsed_output_filename" value="data/[% datadir %]/ppla_output/parsed_output.storable"/> <item id="profile_sub_dirs" value="[% dir_str %]"/> <item id="cutoff" value="[% cutoff_for_stability_percent %]"/> <item id="complete_ppla_output" value="data/[% datadir %]/ppla_output/100/file_list_0.output"/> <!-- Not used --> </input> <output> <item id="output_dir" value="data/[% datadir %]/top_profiles"/> </output> </step> <!-- Morgan's program (which I heavily modified) to create an HTML document to display the profiles of interest --> <!-- perl visualiseTriplets.pl ../glioma_data/sorted_profiles_both_annotated.out ../glioma_data/profiles.txt -eprofile_results_complete_annotations.storable > ~/public_html/project_logic_analysis/glioma_profiles/brain_profiles_logic_type_both_annotated.html --> [% index = index + 1 %] [% FOREACH dir = dirs %] <!-- FIXME: does this sort the profiles? I don't do that elsewhere. 
Maybe I should in PickTopProfiles --> <step id="[% index %]" active="0" type="shell_command" processor="perl scripts/visualiseTriplets.pl"> <processor_args> <arg id="1" name="" value="data/[% datadir %]/top_profiles/[% dir %]/top_[% cutoff_for_stability_percent %]_percent.profiles"/> <arg id="2" name="" value="data/[% datadir %]/profiles/100/file_list_0.profile"/> <arg id="3" name="" value="-edata/annotations/profile_results_complete_annotations.storable"/> <arg id="4" name="" value="> data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.html"/> </processor_args> <output> <item id="output_file" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.html" publish="0"/> </output> </step> [% index = index + 1 %] [% END %] <!-- This section calls Peter's code to calculate p-values based on hypergeometric dist. It relies on the output of ReadProfileOutput and PickTopProfiles. It's a hack on the visualizer to create an input to Peter's hypergeometric calculation. --> [% index = index + 1 %] [% FOREACH dir = dirs %] <!-- FIXME: does this sort the profiles? I don't do that elsewhere. 
Maybe I should in PickTopProfiles --> <step id="[% index %]" active="0" type="shell_command" processor="perl scripts/exportTriplets.pl"> <processor_args> <arg id="1" name="" value="data/[% datadir %]/top_profiles/[% dir %]/top_[% cutoff_for_stability_percent %]_percent.profiles"/> <arg id="2" name="" value="data/[% datadir %]/profiles/100/file_list_0.profile"/> <arg id="3" name="" value="-edata/annotations/profile_results_complete_annotations.storable"/> <arg id="4" name="" value="> data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt"/> </processor_args> <output> <item id="output_file" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt" publish="0"/> </output> </step> [% index = index + 1 %] <step id="[% index %]" active="0" type="shell_command" processor="perl scripts/hypergeometric.pl"> <processor_args> <arg id="1" name="" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt"/> <arg id="2" name="" value="> data/[% datadir %]/hypergeometric/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent_hyper_probs.txt"/> </processor_args> </step> <!-- This script just takes the output from exportTriplets and adds some additional information (stability score and p-value) It also reads in the storable object (profile_data) which contains a ton of parsed data and creates a new "p-value" entry that stores the various p-value calculations done by Peter. This is used by FindMostConnectedNodes and visualiseTriplets.pl to annotate the results. This script writes the frequency and p-values back to data/[% datadir %]/ppla_output/parsed_output.storable --> <!-- FIXME: I should look for ways to consolidate the writing of p-values and freq. 
back to parsed_output.storable --> <!-- FIXME: the information contained in the output of this script is really useful and I should (somehow) add it to the visualiseTriplet.pl output.--> [% index = index + 1 %] <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::AppendPValuesToExportOutput"> <input> <item id="profile_output" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt"/> <item id="hypergeometric_output" value="data/[% datadir %]/hypergeometric/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent_hyper_probs.txt"/> <item id="profile_data" value="data/[% datadir %]/ppla_output/parsed_output.storable"/> <item id="subdir" value="[% dir %]"/> </input> <output> <item id="output_file" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent_w_pvalues.txt"/> <item id="output_storable" value="data/[% datadir %]/ppla_output/parsed_output.storable"/> </output> </step> [% index = index + 1 %] [% END %] <!-- now parse out the top profiles and collect some statistics on them --> <!-- FIXME: this is redundant with what's below! --> [% index = index + 1 %] <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAOutputProfiles"> <input> <item id="ppla_output_profiles_dir" value="data/[% datadir %]/top_profiles"/> <item id="subdirs" value="[% dir_str %]"/> <item id="filename" value="top_[% cutoff_for_stability_percent %]_percent.profiles"/> </input> <output> <item id="stash_output" value="stable_ppla_output_profiles"/> <item id="stash_output_file" value="data/[% datadir %]/top_profiles/parsed_ppla_output.storable"/> </output> </step> <!-- This step goes through the PPLA output parsed above and counts how many times a given probeset is included in a triplet relationship. 
It then summarizes this information into a hash and hands off the display of the information to a tt2. The output is an HTML document that lists the most connected genes and links to the output to visualiseTriplets.pl for each subset. This script is responsible for calling visualiseTriplets.pl on the subset of the triplets in question to visualize the individual networks with html and png output. --> <!-- FIXME: this step should be followed with other network-based analysis on the logic relationships --> <!-- FIXME: subdir is currently hardcoded inside this script!! --> <!-- FIXME: remove the calls to other programs/scripts and move this fxn into another module --> [% index = index + 1 %] [% FOREACH dir = dirs %] <!-- FIXME: failing with "undef error - Can't use string ("") as a HASH ref while "strict refs" in use at lib/perl/Nelson/Pipe/Container/Job/FindMostConnectedNodes.pm line 383" --> <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::FindMostConnectedNodes"> <input> <item id="stash_input" value="stable_ppla_output_profiles"/> <item id="ppla_output_profiles_dir" value="data/[% datadir %]/top_profiles"/> <item id="subdir" value="[% dir%]"/> <item id="extra_info" value="data/annotations/profile_results_complete_annotations.storable"/> <item id="filename" value="top_[% cutoff_for_stability_percent %]_percent.profiles"/> <item id="pvalues" value="data/[% datadir %]/ppla_output/parsed_output.storable"/> <item id="AppendPValuesToExportOutput_output" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent_w_pvalues.txt"/> <item id="min_triplets" value="3"/> <item id="template_most_connected" value="index_for_connected_nodes.tt2"/> <item id="template_lowest_p_value" value="index_for_connected_nodes.tt2"/> <item id="template_most_stable" value="index_for_connected_nodes.tt2"/> <item id="template_detailed" value="details_for_connected_nodes.tt2"/> <item id="template_dir" 
value="data/[% datadir %]/analysis/templates"/> <item id="profiles_to_count" value="[% profiles_to_count %]"/> </input> <output> <item id="output_dir" value="data/[% datadir %]/visualization/90/top_[% cutoff_for_stability_percent %]_percent_stable/breakdown_3_or_more"/> </output> </step> [% index = index + 1 %] [% END %] <!-- Just creates a summary page at http://humerus/project_logic_analysis --> [% index = index + 1 %] <step id="[% index %]" active="0" type="shell_command" processor="./scripts/wiki2html.pl"> <processor_args> <arg id="1" name="" value="data/[% datadir %]/visualization/introduction.txt"/> <arg id="2" name="" value="> /raid5a/boconnor/public_html/Projects/Project_Logic_Analysis/Logic_Analysis_Network_Stability_With_Original_Brain_Tumor_Data_Pipe/index.html"/> </processor_args> </step> <!-- look for a bias in the oncogene/tumor suppressor annotations. This is less flexible than the generic annotation bias checker below --> <!-- FIXME: includes some hardcoded elements --> [%# INCLUDE oncogene_counts.xml %] <!-- This series of scripts takes an input list of "interesting" probesets and build a network of what they connect with It annotates those probesets using the Affy array information file and then colors the nodes as green if a "!" probeset and red if it's expressed. FIXME: this assumption only works if the network is built with one phenotype at a time! The nodes then link back to the summary HTML descriptions and the edges link to records within the HTML description files making it easy to see the actual relationship, binary profiles, and additional free text annotations associated with each probeset. The outp... [truncated message content] |
|
From: <all...@su...> - 2006-02-17 02:15:50
|
Update of /cvsroot/libnelson/Pg/celsius/bin In directory sumo.genetics.ucla.edu:/tmp/cvs-serv8931/bin Modified Files: gecIDsync Log Message: now loads annotations to annot.allenday_gec Index: gecIDsync =================================================================== RCS file: /cvsroot/libnelson/Pg/celsius/bin/gecIDsync,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** gecIDsync 31 Jan 2006 01:14:42 -0000 1.1 --- gecIDsync 17 Feb 2006 02:15:47 -0000 1.2 *************** *** 9,13 **** my $dbh = DBI->connect('dbi:Pg:dbname=chado-celsius;host=soleus.ctrl.ucla.edu','',''); ! $dbh->do('SET search_path TO cel, public'); my $select1_sth = $dbh->prepare('SELECT cel_id FROM cel_dbxref, dbxref WHERE cel_dbxref.dbxref_id = dbxref.dbxref_id AND dbxref.accession LIKE ?'); --- 9,13 ---- my $dbh = DBI->connect('dbi:Pg:dbname=chado-celsius;host=soleus.ctrl.ucla.edu','',''); ! $dbh->do('SET search_path TO cel, annot, public'); my $select1_sth = $dbh->prepare('SELECT cel_id FROM cel_dbxref, dbxref WHERE cel_dbxref.dbxref_id = dbxref.dbxref_id AND dbxref.accession LIKE ?'); *************** *** 16,19 **** --- 16,20 ---- my $insert1_sth = $dbh->prepare('INSERT INTO dbxref (db_id, accession) VALUES ((SELECT db_id FROM db WHERE name = ?),?)'); my $insert2_sth = $dbh->prepare('INSERT INTO cel_dbxref (cel_id, dbxref_id) VALUES (?,?)'); + my $insert3_sth = $dbh->prepare('INSERT INTO annot.allenday_gec (biomaterial_id, type_id) VALUES (?,(SELECT c.cvterm_id FROM cvterm AS c, dbxref AS x, db AS d WHERE c.dbxref_id = x.dbxref_id AND x.db_id = d.db_id AND x.accession = ? 
AND d.name = ?))'); my @files = $dom->getElementsByTagName('file'); *************** *** 41,44 **** --- 42,52 ---- ( $x ) = $select2_sth->fetchrow_array(); $insert2_sth->execute( $c, $x ); + + my @annots = $file->getElementsByTagName('annotation'); + foreach my $annot ( @annots ) { + my $accession = $annot->getAttribute( 'accession' ); + my ( $db, $acc ) = $accession =~ m/^(.+?):(.+?)$/; + $insert3_sth->execute( $c, $acc, $db ); + } } |
|
From: <boc...@su...> - 2006-02-17 01:57:17
|
Update of /cvsroot/dev-boconnor/project_logic_analysis/scripts In directory sumo.genetics.ucla.edu:/tmp/cvs-serv7343/scripts Modified Files: extract_probesets.sh make_mas5.R Added Files: vgl_input_script.pl Log Message: Added quick script to make VGL sif file for Marc Index: make_mas5.R =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/scripts/make_mas5.R,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** make_mas5.R 11 Aug 2005 23:13:11 -0000 1.2 --- make_mas5.R 17 Feb 2006 01:57:10 -0000 1.3 *************** *** 3,7 **** library(affy); ! file_list = list.files(path="/raid5a/boconnor/cvsroot/dev-boconnor/project_logic_analysis/data/original_glioma_dataset/cel",pattern=".CEL",full.names=TRUE); for (file_name in file_list) --- 3,7 ---- library(affy); ! file_list = list.files(path="/raid5a/boconnor/cvsroot/dev-boconnor/project_logic_analysis/data/t-cell_leukemia_data_Soulier_et_al_2005/cel",pattern=".CEL",full.names=TRUE); for (file_name in file_list) Index: extract_probesets.sh =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/scripts/extract_probesets.sh,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** extract_probesets.sh 3 Feb 2006 04:39:30 -0000 1.1 --- extract_probesets.sh 17 Feb 2006 01:57:10 -0000 1.2 *************** *** 1,9 **** #!/bin/bash ! i=0 while [ $i -le $1 ] do ! echo "cat $2 | grep -P '^5' | grep HC_2B | awk '{ print $3 }' | sort | uniq > $3/$i.txt" ! cat $2 | grep -P '^5' | grep HC_2B | awk '{ print $3 }' | sort | uniq > $3/$i.txt i=$((i+1)) done --- 1,20 ---- #!/bin/bash ! # non-random list ! ! echo "Extracting Non-Random List" ! echo "cat $3 | grep -P '^5' | grep $2 | awk '{ print $3 }' | sort | uniq > $5/non_random.txt" ! cat $3 | grep -P '^5' | grep $2 | awk '{ print $3 }' | sort | uniq > $5/non_random.txt ! ! # random list ! 
echo "cat $4 | grep -P '^\d+' | awk '{ print $1 }' | sort | uniq > /tmp/all_probesets.txt" ! cat $4 | grep -P '^\d+' | awk '{ print $1 }' | sort | uniq > /tmp/all_probesets.txt ! ! i=1 while [ $i -le $1 ] do ! echo "Extracting Random List $i" ! echo "cat /tmp/all_probesets.txt | perl scripts/select_probesets_randomly.pl `wc $5/non_random.txt | awk '{ print $1 }'` > $5/random_$i.txt" ! cat /tmp/all_probesets.txt | perl scripts/select_probesets_randomly.pl `wc $5/non_random.txt | awk '{ print $1 }'` > $5/random_$i.txt i=$((i+1)) done --- NEW FILE: vgl_input_script.pl --- #!/usr/local/bin/perl # to make vgl hardcoded and ugly use Storable; my $data = retrieve("data/p53_breast_cancer_data_Miller_et_al_2005/sif_hash.storable"); foreach my $file (sort keys %{$data->{'files'}}) { my $dlda_mt = $data->{'files'}{$file}{'dlda_mt'}; my $dlda_txt = "DLDA_WT"; if ($dlda_mt) { $dlda_txt = "DLDA_MT"; } print "$file\t$file\t$dlda_txt\n"; } |
|
From: <all...@su...> - 2006-02-17 01:45:39
|
Update of /cvsroot/libnelson/java In directory sumo.genetics.ucla.edu:/tmp/cvs-serv6451 Modified Files: gecCel.java Log Message: now exports annotations as well Index: gecCel.java =================================================================== RCS file: /cvsroot/libnelson/java/gecCel.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** gecCel.java 31 Jan 2006 00:50:02 -0000 1.1 --- gecCel.java 17 Feb 2006 01:45:32 -0000 1.2 *************** *** 19,40 **** String SEL_EXP_RUN = "SELECT sample.sample_id, chip.chip_id, chip.file_id || '.file' file_id FROM gen_experiment_chips sample, gen_chip_files chip, gen_file_types type WHERE type.file_type_id = chip.file_type_id AND sample.chip_id = chip.chip_id AND type.file_extension = 'cel'"; Connection conn = null; ! PreparedStatement pstmt = null; ! ResultSet rs = null; ! try { conn = getConnection(); ! pstmt = conn.prepareStatement(SEL_EXP_RUN); ! rs = pstmt.executeQuery(); System.out.println("<files>"); ! while(rs.next()) { ! int chip_id = rs.getInt("CHIP_ID"); ! String file_id = rs.getString("FILE_ID"); ! int sample_id = rs.getInt("SAMPLE_ID"); ! System.out.println(" <file file_id=\"" + file_id + "\" chip_id=\"" + chip_id + "\" sample_id=\"" + sample_id + "\"/>"); } System.out.println("</files>"); ! cleanup(conn, pstmt, rs); } catch(Exception e){ --- 19,68 ---- String SEL_EXP_RUN = "SELECT sample.sample_id, chip.chip_id, chip.file_id || '.file' file_id FROM gen_experiment_chips sample, gen_chip_files chip, gen_file_types type WHERE type.file_type_id = chip.file_type_id AND sample.chip_id = chip.chip_id AND type.file_extension = 'cel'"; + // String SEL_ANNOT = "SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.cell_type_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND sample_id = ? 
UNION SELECT d.sample_id, 'null:' || c.name FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c WHERE d.dev_stage_level_id = c.cvterm_id AND sample_id = ? UNION SELECT d.sample_id, 'null:' || c.name FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c WHERE d.cell_growth_level_id = c.cvterm_id AND sample_id = ? UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_ds d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.disease_state_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND d.sample_id = ? UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_ed d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.experiment_design_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND d.sample_id = ? UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_pt d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.phenotype_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND d.sample_id = ? UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.rna_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND sample_id = ?"; + Connection conn = null; ! PreparedStatement stmt1 = null; ! PreparedStatement stmt2 = null; ! ResultSet rs1 = null; ! ResultSet rs2 = null; ! try { conn = getConnection(); ! stmt1 = conn.prepareStatement(SEL_EXP_RUN); ! rs1 = stmt1.executeQuery(); System.out.println("<files>"); ! while(rs1.next()) { ! int chip_id = rs1.getInt("CHIP_ID"); ! String file_id = rs1.getString("FILE_ID"); ! int sample_id = rs1.getInt("SAMPLE_ID"); ! ! System.out.println(" <file file_id=\"" + file_id + "\" chip_id=\"" + chip_id + "\" sample_id=\"" + sample_id + "\">"); ! ! stmt2 = conn.prepareStatement( ! 
"SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.cell_type_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND sample_id = "+sample_id+" UNION "+ ! "SELECT d.sample_id, 'null:' || c.name FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c WHERE d.dev_stage_level_id = c.cvterm_id AND sample_id = "+sample_id+" UNION "+ ! "SELECT d.sample_id, 'null:' || c.name FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c WHERE d.cell_growth_level_id = c.cvterm_id AND sample_id = "+sample_id+" UNION "+ ! "SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_ds d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.disease_state_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND d.sample_id = "+sample_id+" UNION "+ ! "SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_ed d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.experiment_design_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND d.sample_id = "+sample_id+" UNION "+ ! "SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_pt d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.phenotype_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND d.sample_id = "+sample_id+" UNION "+ ! "SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.rna_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND sample_id = "+sample_id ! ); ! ! // rs2 = stmt2.executeQuery( sample_id, sample_id, sample_id, sample_id, sample_id, sample_id, sample_id ); ! rs2 = stmt2.executeQuery(); ! ! while(rs2.next()) { ! String accession = rs2.getString("ACCESSION"); ! System.out.println(" <annotation accession=\""+ accession +"\"/>"); ! } + rs2.close(); + rs2 = null; + stmt2.close(); + + System.out.println(" </file>"); } System.out.println("</files>"); ! 
cleanup(conn, stmt1, rs1); } catch(Exception e){ *************** *** 53,65 **** } ! private static void cleanup(Connection conn, PreparedStatement pstmt, ResultSet rs) throws SQLException { ! if (rs != null) { ! rs.close(); ! rs = null; } ! if (pstmt != null) { ! pstmt.close(); ! pstmt = null; } if (conn != null) { conn.close(); --- 81,97 ---- } ! private static void cleanup(Connection conn, PreparedStatement stmt1, ResultSet rs1) throws SQLException { ! if (rs1 != null) { ! rs1.close(); ! rs1 = null; } ! if (stmt1 != null) { ! stmt1.close(); ! stmt1 = null; } + // if (stmt2 != null) { + // stmt2.close(); + // stmt2 = null; + // } if (conn != null) { conn.close(); |
|
From: <boc...@su...> - 2006-02-17 00:09:31
|
Update of /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job In directory sumo.genetics.ucla.edu:/tmp/cvs-serv31102/lib/perl/Nelson/Pipe/Container/Job Modified Files: CreateProfiles.pm SifFileParser.pm Log Message: Changes to libs Index: SifFileParser.pm =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/SifFileParser.pm,v retrieving revision 1.8 retrieving revision 1.9 diff -C2 -d -r1.8 -r1.9 *** SifFileParser.pm 16 Feb 2006 00:31:19 -0000 1.8 --- SifFileParser.pm 17 Feb 2006 00:09:22 -0000 1.9 *************** *** 167,187 **** my $grade = $tokens[4]; ! # ER status (ER+=mutant; ER-=wt) ! my $er_wt = 1; ! if ($tokens[5] eq 'ER+') { $er_wt = 0; } # PgR status (PgR+=mutant; PgR-=wt) ! my $pgr_wt = 1; ! if ($tokens[6] eq 'PgR+') { $pgr_wt = 0; } # Lymph node status ! my $lymph_pos = 1; ! if ($tokens[9] eq 'LN-') { $lymph_pos = 0; } $self->{files}{$filename}{p53_wt} = $p53_wt; $self->{files}{$filename}{dlda_mt} = $dlda_mt; $self->{files}{$filename}{grade} = $grade; ! $self->{files}{$filename}{er_wt} = $er_wt; ! $self->{files}{$filename}{pgr_wt} = $pgr_wt; $self->{files}{$filename}{lymph_pos} = $lymph_pos; } --- 167,187 ---- my $grade = $tokens[4]; ! # ER status ! my $er_pos = 0; ! if ($tokens[5] eq 'ER+') { $er_pos = 1; } # PgR status (PgR+=mutant; PgR-=wt) ! my $pgr_pos = 0; ! if ($tokens[6] eq 'PgR+') { $pgr_pos = 1; } # Lymph node status ! my $lymph_pos = 0; ! if ($tokens[9] eq 'LN+') { $lymph_pos = 1; } $self->{files}{$filename}{p53_wt} = $p53_wt; $self->{files}{$filename}{dlda_mt} = $dlda_mt; $self->{files}{$filename}{grade} = $grade; ! $self->{files}{$filename}{er_pos} = $er_pos; ! 
$self->{files}{$filename}{pgr_pos} = $pgr_pos; $self->{files}{$filename}{lymph_pos} = $lymph_pos; } Index: CreateProfiles.pm =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/CreateProfiles.pm,v retrieving revision 1.14 retrieving revision 1.15 diff -C2 -d -r1.14 -r1.15 *** CreateProfiles.pm 16 Feb 2006 20:44:14 -0000 1.14 --- CreateProfiles.pm 17 Feb 2006 00:09:22 -0000 1.15 *************** *** 245,248 **** --- 245,253 ---- $results->{samples}{annotations}{grade_3}{$filename} = 1; $results->{samples}{annotations}{grade_4}{$filename} = 0; + } elsif ($sif_hash->{'files'}{$old_filename}{grade} =~ /G\?/) { + $results->{samples}{annotations}{grade_1}{$filename} = 0; + $results->{samples}{annotations}{grade_2}{$filename} = 0; + $results->{samples}{annotations}{grade_3}{$filename} = 0; + $results->{samples}{annotations}{grade_4}{$filename} = 0; } *************** *** 256,275 **** } ! # er_wt ! if ($sif_hash->{'files'}{$old_filename}{er_wt} =~ /1/) { ! $results->{samples}{annotations}{er_wt}{$filename} = 1; ! $results->{samples}{annotations}{er_mt}{$filename} = 0; ! } elsif ($sif_hash->{'files'}{$old_filename}{er_wt} =~ /0/) { ! $results->{samples}{annotations}{er_wt}{$filename} = 0; ! $results->{samples}{annotations}{er_mt}{$filename} = 1; } ! # pgr_wt ! if ($sif_hash->{'files'}{$old_filename}{pgr_wt} =~ /1/) { ! $results->{samples}{annotations}{pgr_wt}{$filename} = 1; ! $results->{samples}{annotations}{pgr_mt}{$filename} = 0; ! } elsif ($sif_hash->{'files'}{$old_filename}{pgr_wt} =~ /0/) { ! $results->{samples}{annotations}{pgr_wt}{$filename} = 0; ! $results->{samples}{annotations}{pgr_mt}{$filename} = 1; } --- 261,280 ---- } ! # er_pos ! if ($sif_hash->{'files'}{$old_filename}{er_pos} =~ /1/) { ! $results->{samples}{annotations}{er_pos}{$filename} = 1; ! $results->{samples}{annotations}{er_neg}{$filename} = 0; ! 
} elsif ($sif_hash->{'files'}{$old_filename}{er_pos} =~ /0/) { ! $results->{samples}{annotations}{er_pos}{$filename} = 0; ! $results->{samples}{annotations}{er_neg}{$filename} = 1; } ! # pgr_pos ! if ($sif_hash->{'files'}{$old_filename}{pgr_pos} =~ /1/) { ! $results->{samples}{annotations}{pgr_pos}{$filename} = 1; ! $results->{samples}{annotations}{pgr_neg}{$filename} = 0; ! } elsif ($sif_hash->{'files'}{$old_filename}{pgr_pos} =~ /0/) { ! $results->{samples}{annotations}{pgr_pos}{$filename} = 0; ! $results->{samples}{annotations}{pgr_neg}{$filename} = 1; } |
|
From: <boc...@su...> - 2006-02-17 00:09:15
|
Update of /cvsroot/dev-boconnor/project_logic_analysis/conf In directory sumo.genetics.ucla.edu:/tmp/cvs-serv31072/conf Modified Files: make_profiles_and_run_la_include.xml Log Message: Changes to conf files Index: make_profiles_and_run_la_include.xml =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/conf/make_profiles_and_run_la_include.xml,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** make_profiles_and_run_la_include.xml 16 Feb 2006 08:19:58 -0000 1.4 --- make_profiles_and_run_la_include.xml 17 Feb 2006 00:09:07 -0000 1.5 *************** *** 2,6 **** [% index = index + 1 %] <!-- parses the SIF file to generate a hash of file names and their HC --> ! <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::SifFileParser"> <input> <item id="sif_file" value="[% sif_file %]"/> --- 2,6 ---- [% index = index + 1 %] <!-- parses the SIF file to generate a hash of file names and their HC --> ! <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::SifFileParser"> <input> <item id="sif_file" value="[% sif_file %]"/> *************** *** 19,23 **** <!-- reads the hash created by the SIF parser and randomly selects files for 10-90% of the samples --> <!-- FIXME: this creates rounding errors in which the number of files selected is less than it should be! --> ! <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::RandomlySelectFiles"> <input> <item id="sif_file_hash" value="sif_file_hash"/> --- 19,23 ---- <!-- reads the hash created by the SIF parser and randomly selects files for 10-90% of the samples --> <!-- FIXME: this creates rounding errors in which the number of files selected is less than it should be! --> ! 
<step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::RandomlySelectFiles"> <input> <item id="sif_file_hash" value="sif_file_hash"/> *************** *** 39,44 **** [% WHILE j < total_number_profiles %] [% FOREACH dir = dirs %] ! <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::CreateProfiles"> ! <!-- step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::CreateProfiles" execution_type="cluster" --> <input> <item id="file_list_dir" value="data/[% datadir %]/rand_file_lists"/> --- 39,44 ---- [% WHILE j < total_number_profiles %] [% FOREACH dir = dirs %] ! <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::CreateProfiles"> ! <!-- step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::CreateProfiles" execution_type="cluster" --> <input> <item id="file_list_dir" value="data/[% datadir %]/rand_file_lists"/> *************** *** 62,68 **** [% END %] [% index = index + 1 %] <!-- the next two steps just read all the profiles --> ! <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::RandomlySelectFiles"> <input> <item id="sif_file_hash" value="sif_file_hash"/> --- 62,69 ---- [% END %] + <!-- large datasets will take a long time to run and need to use the special version of PPLA --> [% index = index + 1 %] <!-- the next two steps just read all the profiles --> ! <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::RandomlySelectFiles"> <input> <item id="sif_file_hash" value="sif_file_hash"/> *************** *** 76,80 **** </step> [% index = index + 1 %] ! <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::CreateProfiles"> <input> <item id="file_list_dir" value="data/[% datadir %]/complete_file_list"/> --- 77,81 ---- </step> [% index = index + 1 %] ! 
<step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::CreateProfiles"> <input> <item id="file_list_dir" value="data/[% datadir %]/complete_file_list"/> *************** *** 83,87 **** <item id="sif_file_hash" value="sif_file_hash"/> <item id="sif_file_hash_storable" value="data/[% datadir %]/sif_hash.storable"/> ! <item id="parse_old_mas5" value="1"/> <item id="start" value="0"/> <item id="end" value="1"/> --- 84,88 ---- <item id="sif_file_hash" value="sif_file_hash"/> <item id="sif_file_hash_storable" value="data/[% datadir %]/sif_hash.storable"/> ! <item id="parse_old_mas5" value="[% parse_old_mas5 %]"/> <item id="start" value="0"/> <item id="end" value="1"/> *************** *** 101,105 **** [% WHILE j < total_number_profiles %] <!-- execution_type="cluster" --> ! <step id="[% index %].[% j %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::PPLARunner" execution_type="cluster"> <input> <item id="entropy_filter" value="3"/> --- 102,106 ---- [% WHILE j < total_number_profiles %] <!-- execution_type="cluster" --> ! <step id="[% index %].[% j %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::PPLARunner" execution_type="cluster"> <input> <item id="entropy_filter" value="3"/> |
|
From: <boc...@su...> - 2006-02-16 20:44:25
|
Update of /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job In directory sumo.genetics.ucla.edu:/tmp/cvs-serv14105/lib/perl/Nelson/Pipe/Container/Job Modified Files: CreateProfiles.pm RandomlySelectFiles.pm Log Message: Changed the create profile script Index: RandomlySelectFiles.pm =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/RandomlySelectFiles.pm,v retrieving revision 1.8 retrieving revision 1.9 diff -C2 -d -r1.8 -r1.9 *** RandomlySelectFiles.pm 16 Feb 2006 07:37:02 -0000 1.8 --- RandomlySelectFiles.pm 16 Feb 2006 20:44:14 -0000 1.9 *************** *** 68,73 **** } elsif ($random_selection_technique eq 'across_all_samples') { ! ! open TESTSET, ">$rand_file_list_dir/test_set_$percent_to_rand_select.txt" or die; my @files = shuffle keys(%{$stash->{$sif_file_hash}{'files'}}); my $size = int(0.01 * $percent_to_hold_for_testset * scalar(@files)); --- 68,73 ---- } elsif ($random_selection_technique eq 'across_all_samples') { ! system ("mkdir -p $rand_file_list_dir"); ! open TESTSET, ">$rand_file_list_dir/test_set_$percent_to_rand_select.txt" or die "Can't open $rand_file_list_dir/test_set_$percent_to_rand_select.txt\n"; my @files = shuffle keys(%{$stash->{$sif_file_hash}{'files'}}); my $size = int(0.01 * $percent_to_hold_for_testset * scalar(@files)); Index: CreateProfiles.pm =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/CreateProfiles.pm,v retrieving revision 1.13 retrieving revision 1.14 diff -C2 -d -r1.13 -r1.14 *** CreateProfiles.pm 16 Feb 2006 08:20:06 -0000 1.13 --- CreateProfiles.pm 16 Feb 2006 20:44:14 -0000 1.14 *************** *** 39,43 **** $self->{mas5_cache} = {}; if ($pre_cache_mas5 eq '1') { ! 
$self->{mas5_cache} = $self->_pre_cache_mas5($mas5_dir); } --- 39,44 ---- $self->{mas5_cache} = {}; if ($pre_cache_mas5 eq '1') { ! if (-f $mas5_cache_output) { $self->{mas5_cache} = retrieve($mas5_cache_output); } ! else { $self->{mas5_cache} = $self->_pre_cache_mas5($mas5_dir); } } |