nelsonlab-cmts Mailing List for nelsonlab

Brought to you by: allenday, boconnor, bret_harry, jmendler, and 2 others

nelsonlab-cmts — cvs commits

You can subscribe to this list here.

2006	_Jan	_Feb (38)	_Mar (5)	_Apr	_May	_Jun	_Jul	_Aug	_Sep	_Oct	_Nov	_Dec

Flat | Threaded

1 2 > >> (Page 1 of 2)

[Nelsonlab-cmts] libnelson/Pg/celsius/bin celsius-extract,1.5,1.6

From: <all...@su...> - 2006-03-14 18:19:07

Update of /cvsroot/libnelson/Pg/celsius/bin
In directory sumo.genetics.ucla.edu:/tmp/cvs-serv8582/bin

Modified Files:
	celsius-extract 
Log Message:
updates for marc


Index: celsius-extract
===================================================================
RCS file: /cvsroot/libnelson/Pg/celsius/bin/celsius-extract,v
retrieving revision 1.5
retrieving revision 1.6
diff -C2 -d -r1.5 -r1.6
*** celsius-extract	16 Feb 2006 02:49:35 -0000	1.5
--- celsius-extract	14 Mar 2006 18:18:39 -0000	1.6
***************
*** 35,39 ****
  $sn =~ s/^SN://;
  
! my $dbh1 = DBI->connect('dbi:Pg:dbname=chado-celsius;host=soleus.ctrl.ucla.edu','','');
  
  if ( $method eq 'cel' ) {
--- 35,39 ----
  $sn =~ s/^SN://;
  
! my $dbh1 = DBI->connect('dbi:Pg:dbname=chado-celsius;host=torso.genomics.ctrl.ucla.edu');
  
  if ( $method eq 'cel' ) {
***************
*** 56,67 ****
  elsif ( $valid_method{ $method } ) {
    #Brian may want the mas5.call and mas5.p
    my $sth = $dbh1->prepare(qq(
    SET search_path TO cel, part_elementresult, public;
  
!   SELECT d1.name || x1.accession AS accession, x2.accession AS probeset, $method.signal
!   FROM part_elementresult.$method, cel, cel_dbxref AS cx, element, dbxref AS x1, dbxref AS x2, db AS d1, quantification AS q
!   WHERE $method.quantification_id = q.quantification_id
      AND q.acquisition_id = cel.cel_id
!     AND $method.element_id = element.element_id
      AND element.dbxref_id = x2.dbxref_id
      AND cel.cel_id = cx.cel_id
--- 56,68 ----
  elsif ( $valid_method{ $method } ) {
    #Brian may want the mas5.call and mas5.p
+ 
    my $sth = $dbh1->prepare(qq(
    SET search_path TO cel, part_elementresult, public;
  
!   SELECT d1.name || x1.accession AS accession, x2.accession AS probeset, r.signal
!   FROM part_elementresult.${method}_byq AS r, cel, cel_dbxref AS cx, element, dbxref AS x1, dbxref AS x2, db AS d1, quantification AS q
!   WHERE r.quantification_id = q.quantification_id
      AND q.acquisition_id = cel.cel_id
!     AND r.element_id = element.element_id
      AND element.dbxref_id = x2.dbxref_id
      AND cel.cel_id = cx.cel_id

[Nelsonlab-cmts] Geo-Google/lib/Geo Google.pm,1.3,1.4

From: <all...@su...> - 2006-03-13 23:22:58

Update of /cvsroot/Geo-Google/lib/Geo
In directory sumo.genetics.ucla.edu:/tmp/cvs-serv19028/lib/Geo

Modified Files:
	Google.pm 
Log Message:
version bump for release


Index: Google.pm
===================================================================
RCS file: /cvsroot/Geo-Google/lib/Geo/Google.pm,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** Google.pm	13 Mar 2006 23:22:24 -0000	1.3
--- Google.pm	13 Mar 2006 23:22:53 -0000	1.4
***************
*** 122,126 ****
  use strict;
  use warnings;
! our $VERSION = '0.01';
  
  #this gets a javascript page containing map XML
--- 122,126 ----
  use strict;
  use warnings;
! our $VERSION = '0.02';
  
  #this gets a javascript page containing map XML

[Nelsonlab-cmts] Geo-Google/lib/Geo/Google Location.pm,1.3,1.4

From: <all...@su...> - 2006-03-13 23:22:29

Update of /cvsroot/Geo-Google/lib/Geo/Google
In directory sumo.genetics.ucla.edu:/tmp/cvs-serv18966/lib/Geo/Google

Modified Files:
	Location.pm 
Log Message:
update to specify version number of gmaps api


Index: Location.pm
===================================================================
RCS file: /cvsroot/Geo-Google/lib/Geo/Google/Location.pm,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** Location.pm	3 Aug 2005 15:56:07 -0000	1.3
--- Location.pm	13 Mar 2006 23:22:24 -0000	1.4
***************
*** 61,65 ****
  =head1 SYNOPSIS
  
!   use Geo::Google::Location;
    # you shouldn't need to construct these yourself,
    # have a Geo::Google object do it for you. 
--- 61,65 ----
  =head1 SYNOPSIS
  
!   use Geo::Google::Point;
    # you shouldn't need to construct these yourself,
    # have a Geo::Google object do it for you.

[Nelsonlab-cmts] Geo-Google/lib/Geo Google.pm,1.2,1.3

From: <all...@su...> - 2006-03-13 23:22:29

Update of /cvsroot/Geo-Google/lib/Geo
In directory sumo.genetics.ucla.edu:/tmp/cvs-serv18966/lib/Geo

Modified Files:
	Google.pm 
Log Message:
update to specify version number of gmaps api


Index: Google.pm
===================================================================
RCS file: /cvsroot/Geo-Google/lib/Geo/Google.pm,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** Google.pm	2 Aug 2005 05:48:12 -0000	1.2
--- Google.pm	13 Mar 2006 23:22:24 -0000	1.3
***************
*** 125,132 ****
  
  #this gets a javascript page containing map XML
! use constant LQ => 'http://maps.google.com/maps?output=js&q=%s';
  
  #this gets a javascript page containing map XML.  special for "nearby" searches
! use constant NQ => 'http://maps.google.com/maps?output=js&near=%s&q=%s';
  
  #used in polyline codec
--- 125,132 ----
  
  #this gets a javascript page containing map XML
! use constant LQ => 'http://maps.google.com/maps?output=js&v=1&q=%s';
  
  #this gets a javascript page containing map XML.  special for "nearby" searches
! use constant NQ => 'http://maps.google.com/maps?output=js&v=1&near=%s&q=%s';
  
  #used in polyline codec

[Nelsonlab-cmts] libnelson/Pg/celsius/bin profile.pl,1.5,1.6

From: <all...@su...> - 2006-03-11 02:56:06

Update of /cvsroot/libnelson/Pg/celsius/bin
In directory sumo.genetics.ucla.edu:/tmp/cvs-serv3867/bin

Modified Files:
	profile.pl 
Log Message:
updates to use torso.  also added "-m" option (untested) to allow specification
of full profile, rather than dimension coordinates to create matrix from db
dynamically.


Index: profile.pl
===================================================================
RCS file: /cvsroot/libnelson/Pg/celsius/bin/profile.pl,v
retrieving revision 1.5
retrieving revision 1.6
diff -C2 -d -r1.5 -r1.6
*** profile.pl	3 Feb 2006 01:28:58 -0000	1.5
--- profile.pl	11 Mar 2006 02:56:02 -0000	1.6
***************
*** 7,10 ****
--- 7,11 ----
  my $cel_file;
  my $element_file;
+ my $matrix_file;
  my $verbose;
  my $help;
***************
*** 13,21 ****
              "cel|c=s"     => \$cel_file,
              "element|e=s" => \$element_file,
              "verbose|v"   => \$verbose,
              "help|h"      => \$help
  );
  
! if ( ! $cel_file or ! $element_file or $help ) {
    print <<"USAGE";
  Usage: $0 [-v] -c <file of SN accessions> -e <file of probeset identifiers>
--- 14,23 ----
              "cel|c=s"     => \$cel_file,
              "element|e=s" => \$element_file,
+             "matrix|m=s"  => \$matrix_file,
              "verbose|v"   => \$verbose,
              "help|h"      => \$help
  );
  
! if ( !($matrix_file or ($cel_file and $element_file)) or $help ) {
    print <<"USAGE";
  Usage: $0 [-v] -c <file of SN accessions> -e <file of probeset identifiers>
***************
*** 24,31 ****
  Celsius warehouse.
  
! This program takes two inputs:
    1) a list of SN identifiers as provisioned by the celsius CEL warehouse
    2) a list of Affymetrix probeset identifiers.
  
  $0 caculates Euclidean distance in a P-dimensional space where P is the number
  of probesets in the probeset identifier file.  The median values of all SN
--- 26,39 ----
  Celsius warehouse.
  
! This program can operate in two modes.
! 
! The first mode requires the -c and -e options, taking two inputs:
    1) a list of SN identifiers as provisioned by the celsius CEL warehouse
    2) a list of Affymetrix probeset identifiers.
  
+ The second mode requires the -m options, taking one input:
+   * a matrix, columns as chip IDs (your own), rows as Affymetrix probeset
+     identifiers.
+ 
  $0 caculates Euclidean distance in a P-dimensional space where P is the number
  of probesets in the probeset identifier file.  The median values of all SN
***************
*** 45,93 ****
  }
  
! my $dbh = DBI->connect('dbi:Pg:dbname=modulus;host=soleus.ctrl.ucla.edu','allenday','');
! my $element_sth = $dbh->prepare('SELECT element_id FROM element WHERE name = ?');
! my $signal_sth = $dbh->prepare("SELECT cel.db || ':' || cel.accession AS accession, element.name, result.signal FROM cel, element, result WHERE cel.cel_id = result.cel_id AND element.element_id = result.element_id AND element.element_id = ?");
  
  my @cel;
- print STDERR "reading cel file..." if $verbose;
- open(F, $cel_file);
- my @cel = <F>;
- chomp @cel;
- close(F);
- print STDERR "done\n" if $verbose;
- 
  my @element;
! print STDERR "reading element file..." if $verbose;
! open(F, $element_file);
! my @element = <F>;
! chomp @element;
! close(F);
! print STDERR "done\n" if $verbose;
  
  my %result = ();
  my %percentile = ();
- my %profile = ();
- my %element = ();
  my %sample = map {$_=>1} @cel;
  
  foreach my $e ( @element ) {
-   $element_sth->execute( $e );
-   my ( $id ) = $element_sth->fetchrow_array();
-   die "no id for $e" unless $id;
-   $element{ $e } = $id;
-   print STDERR "element_id for $e = $id\n" if $verbose;
- }
- 
- foreach my $e ( keys %element ) {
    my $full_dist = Statistics::Descriptive::Full->new();
!   my $prof_dist = Statistics::Descriptive::Full->new();
  
    my @accessions = ();
  
    print STDERR "retrieving signal for element $e..." if $verbose;
!   $signal_sth->execute( $element{ $e } );
    while ( my $row = $signal_sth->fetchrow_hashref ) {
      $full_dist->add_data( $row->{ 'signal' } );
!     if ( $sample{ $row->{ 'accession' } } ) {
        $prof_dist->add_data( $row->{ 'signal' } );
      }
--- 53,124 ----
  }
  
! my $dbh = DBI->connect('dbi:Pg:dbname=chado-celsius;host=torso.genomics.ctrl.ucla.edu');
! $dbh->do('SET search_path TO cel, annot, part_elementresult, public');
! my $element_sth = $dbh->prepare('SELECT element_id FROM element AS e, dbxref AS x WHERE e.dbxref_id = x.dbxref_id AND x.accession = ?');
! my $signal_sth = $dbh->prepare("
! SELECT d.name || ':' || x.accession AS accession, r.signal FROM rma AS r, quantification AS q, cel AS c, cel_dbxref AS cx, dbxref AS x, db AS d WHERE r.element_id = (SELECT element_id FROM element WHERE dbxref_id = (SELECT dbxref_id FROM dbxref WHERE accession = ?)) AND r.quantification_id = q.quantification_id AND c.cel_id = q.acquisition_id AND c.cel_id = cx.cel_id AND cx.dbxref_id = x.dbxref_id AND x.db_id = d.db_id AND d.name = 'SN'
! ");
! my $annotation_sth = $dbh->prepare("SELECT DISTINCT c.name FROM cvterm AS c, dbxref AS x1, dbxref AS x2, cel_dbxref AS cx, acquisition AS q, biomaterialprop AS p WHERE c.cvterm_id = p.type_id AND p.biomaterial_id = q.assay_id AND q.acquisition_id = cx.cel_id AND cx.dbxref_id = x1.dbxref_id AND x1.accession = ? AND c.dbxref_id = x2.dbxref_id AND x2.db_id = (SELECT db_id FROM db WHERE name = ?) AND p.part != 'allenday_tumor' ORDER BY c.name");
  
  my @cel;
  my @element;
! my %matrix = ();
! 
! if ( $matrix_file ) {
!   print STDERR "reading matrix file..." if $verbose;
!   open(F, $matrix_file) or die "couldn't open matrix file '$matrix_file': $!";
!   my $cel_line = <F>;
!   chomp $cel_line;
!   @cel = split /\t/, $cel_line;
!   shift @cel;
!   while ( my $element_line = <F> ) {
!     chomp $element_line;
!     my ( $e, @signal ) = split /\t/, $element_line;
!     push @element, $e;
!     my $element_dist = Statistics::Descriptive::Full->new();
!     $element_dist->add_data( @signal );
!     $matrix{ $e } = $element_dist;
!   }
!   close(F);
! }
! else {
!   print STDERR "reading cel file..." if $verbose;
!   open(F, $cel_file) or die "couldn't open cel file '$cel_file': $!";
!   @cel = <F>;
!   chomp @cel;
!   close(F);
!   print STDERR "done\n" if $verbose;
! 
!   print STDERR "reading element file..." if $verbose;
!   open(F, $element_file) or die "couldn't open element file '$element_file': $!";
!   @element = <F>;
!   chomp @element;
!   close(F);
!   print STDERR "done\n" if $verbose;
! }
  
  my %result = ();
  my %percentile = ();
  my %sample = map {$_=>1} @cel;
+ my %profile = ();
  
  foreach my $e ( @element ) {
    my $full_dist = Statistics::Descriptive::Full->new();
! 
!   my $prof_dist;
!   if ( $matrix_file ) {
!     $prof_dist = $matrix{ $e };
!   }
!   else {
!     $prof_dist = Statistics::Descriptive::Full->new();
!   }
  
    my @accessions = ();
  
    print STDERR "retrieving signal for element $e..." if $verbose;
!   $signal_sth->execute( $e );
    while ( my $row = $signal_sth->fetchrow_hashref ) {
      $full_dist->add_data( $row->{ 'signal' } );
!     if ( (!$matrix_file) and $sample{ $row->{ 'accession' } } ) {
        $prof_dist->add_data( $row->{ 'signal' } );
      }
***************
*** 133,140 ****
    }
    $distance = sqrt( $distance );
!   push @n, [$c, $distance];
  }
  
! foreach my $n ( sort { $a->[1] <=> $b->[1] } @n ) {
!   print $n->[0] ."\t". $n->[1] ."\n";
  }
--- 164,190 ----
    }
    $distance = sqrt( $distance );
! 
! 
!   push @n, [$distance, $c];
  }
  
! foreach my $n ( sort { $a->[0] <=> $b->[0] } @n ) {
!   my $mpath     = get_annotations( $n->[1], 'MPATH' );
!   my $phenotype = get_annotations( $n->[1], 'MP' );
!   my $cell      = get_annotations( $n->[1], 'CL' );
!   my $anatomy   = get_annotations( $n->[1], 'MA' );
!   my $etc       = get_annotations( $n->[1], 'null' );
!   print join( "\t", ( @{$n}, $anatomy, $mpath, $phenotype, $cell, $etc ) ), "\n";
! }
! 
! sub get_annotations {
!   my $snid = shift;
!   my $dbspace = shift;
!   $snid =~ s/SN://;
!   $annotation_sth->execute($snid, $dbspace);
!   my @a;
!   while ( my ( $name ) = $annotation_sth->fetchrow_array() ) {
!     push @a, $name;
!   }
!   return join ';', @a; 
  }

[Nelsonlab-cmts] libnelson/java gecCel.java,1.4,1.5

From: <all...@su...> - 2006-02-28 19:49:05

Update of /cvsroot/libnelson/java
In directory sumo.genetics.ucla.edu:/tmp/cvs-serv26404

Modified Files:
	gecCel.java 
Log Message:
get new samples first


Index: gecCel.java
===================================================================
RCS file: /cvsroot/libnelson/java/gecCel.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -d -r1.4 -r1.5
*** gecCel.java	28 Feb 2006 19:37:25 -0000	1.4
--- gecCel.java	28 Feb 2006 19:48:56 -0000	1.5
***************
*** 17,21 ****
  public class gecCel {
    public static void main(String[] argv) {
!     String SEL_EXP_RUN = "SELECT u.acs_user_id user_id, experiment.experiment_id, run.run_id, sample.sample_id, chip.chip_id, chip.file_id || '.file' file_id FROM sb_users u, gen_experiments experiment, gen_experiment_runs run, gen_experiment_chips sample, gen_chip_files chip, gen_file_types type WHERE run.run_id = sample.run_id AND type.file_type_id = chip.file_type_id AND sample.chip_id = chip.chip_id AND type.file_extension = 'cel' AND experiment.experiment_id = run.experiment_id AND experiment.survey_taker_id = u.user_id";
  //    String SEL_ANNOT = "SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.cell_type_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND sample_id = ? UNION SELECT d.sample_id, 'null:' || c.name FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c WHERE d.dev_stage_level_id = c.cvterm_id AND sample_id = ? UNION SELECT d.sample_id, 'null:' || c.name FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c WHERE d.cell_growth_level_id = c.cvterm_id AND sample_id = ? UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_ds d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.disease_state_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND d.sample_id = ? UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_ed d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.experiment_design_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND d.sample_id = ? UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_pt d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.phenotype_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND d.sample_id = ? UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.rna_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND sample_id = ?";
  
--- 17,21 ----
  public class gecCel {
    public static void main(String[] argv) {
!     String SEL_EXP_RUN = "SELECT u.acs_user_id user_id, experiment.experiment_id, run.run_id, sample.sample_id, chip.chip_id, chip.file_id || '.file' file_id FROM sb_users u, gen_experiments experiment, gen_experiment_runs run, gen_experiment_chips sample, gen_chip_files chip, gen_file_types type WHERE run.run_id = sample.run_id AND type.file_type_id = chip.file_type_id AND sample.chip_id = chip.chip_id AND type.file_extension = 'cel' AND experiment.experiment_id = run.experiment_id AND experiment.survey_taker_id = u.user_id ORDER BY sample_id DESC";
  //    String SEL_ANNOT = "SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.cell_type_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND sample_id = ? UNION SELECT d.sample_id, 'null:' || c.name FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c WHERE d.dev_stage_level_id = c.cvterm_id AND sample_id = ? UNION SELECT d.sample_id, 'null:' || c.name FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c WHERE d.cell_growth_level_id = c.cvterm_id AND sample_id = ? UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_ds d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.disease_state_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND d.sample_id = ? UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_ed d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.experiment_design_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND d.sample_id = ? UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_pt d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.phenotype_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND d.sample_id = ? UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.rna_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND sample_id = ?";

[Nelsonlab-cmts] libnelson/java gecCel.java,1.3,1.4

From: <all...@su...> - 2006-02-28 19:37:29

Update of /cvsroot/libnelson/java
In directory sumo.genetics.ucla.edu:/tmp/cvs-serv25298

Modified Files:
	gecCel.java 
Log Message:
removed dup attribute


Index: gecCel.java
===================================================================
RCS file: /cvsroot/libnelson/java/gecCel.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** gecCel.java	27 Feb 2006 07:58:07 -0000	1.3
--- gecCel.java	28 Feb 2006 19:37:25 -0000	1.4
***************
*** 44,48 ****
                                     "sample_id=\""     + sample_id     + "\" " +
                                     "user_id=\""       + user_id       + "\" " +
-                                    "run_id=\""        + run_id        + "\" " +
                                     "experiment_id=\"" + exp_id        + "\" " +
                               ">"); 
--- 44,47 ----

[Nelsonlab-cmts] dev-boconnor/project_logic_analysis/conf original_glioma_classification_with_vgl.xml,1.2,1.3 original_glioma_vgl_parsing_pipe.xml,1.2,1.3

From: <boc...@su...> - 2006-02-28 07:41:09

Update of /cvsroot/dev-boconnor/project_logic_analysis/conf
In directory sumo.genetics.ucla.edu:/tmp/cvs-serv4067

Modified Files:
	original_glioma_classification_with_vgl.xml 
	original_glioma_vgl_parsing_pipe.xml 
Log Message:
I did a code review on these modules to make sure the performance I'm seeing with VGL is correct.  I found some bugs, specifically in the 1A category (due to a problem with the VGL output format) and also with the mean of the category being used rather than the other categories' means.  These problems have been fixed, yet the performance looks the same if not a little worse.


Index: original_glioma_classification_with_vgl.xml
===================================================================
RCS file: /cvsroot/dev-boconnor/project_logic_analysis/conf/original_glioma_classification_with_vgl.xml,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** original_glioma_classification_with_vgl.xml	23 Feb 2006 22:11:58 -0000	1.2
--- original_glioma_classification_with_vgl.xml	28 Feb 2006 07:41:06 -0000	1.3
***************
*** 33,37 ****
          </output>
        </step>
!       <step id="48.4" active="0" type="shell_command" processor="R">
          <processor_args>
            <arg id="1" name="" value="--vanilla"/>
--- 33,37 ----
          </output>
        </step>
!       <step id="48.4" active="1" type="shell_command" processor="R">
          <processor_args>
            <arg id="1" name="" value="--vanilla"/>

Index: original_glioma_vgl_parsing_pipe.xml
===================================================================
RCS file: /cvsroot/dev-boconnor/project_logic_analysis/conf/original_glioma_vgl_parsing_pipe.xml,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** original_glioma_vgl_parsing_pipe.xml	23 Feb 2006 22:11:58 -0000	1.2
--- original_glioma_vgl_parsing_pipe.xml	28 Feb 2006 07:41:06 -0000	1.3
***************
*** 7,11 ****
                    way around this would be for Marc to alter the output so the summary columns are in a fixed position.
          -->
!       <step id="30.2" active="0" type="module" processor="Nelson::Pipe::Container::Job::ReadVGLOutput">
          <input>
            <item id="profile_output_dir" value="data/[% datadir %]/vgl"/>
--- 7,11 ----
                    way around this would be for Marc to alter the output so the summary columns are in a fixed position.
          -->
!       <step id="30.2" active="1" type="module" processor="Nelson::Pipe::Container::Job::ReadVGLOutput">
          <input>
            <item id="profile_output_dir" value="data/[% datadir %]/vgl"/>

[Nelsonlab-cmts] dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job ReadVGLOutput.pm,1.8,1.9 ScoreSamplesViaVGL.pm,1.6,1.7

From: <boc...@su...> - 2006-02-28 07:36:47

Update of /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job
In directory sumo.genetics.ucla.edu:/tmp/cvs-serv3630/lib/perl/Nelson/Pipe/Container/Job

Modified Files:
	ReadVGLOutput.pm ScoreSamplesViaVGL.pm 
Log Message:
I did a code review on these modules to make sure the performance I'm seeing with VGL is correct.  I found some bugs, specifically in the 1A category (due to a problem with the VGL output format) and also with the mean of the category being used rather than the other categories' means.  These problems have been fixed, yet the performance looks the same if not a little worse.


Index: ScoreSamplesViaVGL.pm
===================================================================
RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/ScoreSamplesViaVGL.pm,v
retrieving revision 1.6
retrieving revision 1.7
diff -C2 -d -r1.6 -r1.7
*** ScoreSamplesViaVGL.pm	23 Feb 2006 22:11:59 -0000	1.6
--- ScoreSamplesViaVGL.pm	28 Feb 2006 07:36:43 -0000	1.7
***************
*** 116,120 ****
      foreach my $probe (keys %{$top_profiles->{$hc}}) {
        #print "On File: $file using HC: $hc using Probe: $probe\n";
!       my $score_mean = $top_profiles->{$hc}{$probe}{mean} / $top_profiles->{$hc}{$probe}{counts};
        my $sample_mean = $exp_values->{$file}{$probe};
        #print "$file $probe ".Dumper($exp_values->{$file}{$probe})."\n";
--- 116,147 ----
      foreach my $probe (keys %{$top_profiles->{$hc}}) {
        #print "On File: $file using HC: $hc using Probe: $probe\n";
!       # BUG: I think this should actually be the others_mean!
!       #my $score_mean = $top_profiles->{$hc}{$probe}{mean} / $top_profiles->{$hc}{$probe}{counts};
!       my $score_mean = $top_profiles->{$hc}{$probe}{others_mean} / $top_profiles->{$hc}{$probe}{counts}; 
! 
! # just used one vgl output for this
! #The probe: 202189_x_at
! #$VAR1 = {
! #          'others_mean' => '1271.90763461538',
! #          'pvalue' => '2.13236165697335e-12',
! #          'mean' => '601.166214285714',
! #          'counts' => 3
! #        };
! 
! # something is a bit fishy, the counts above, why isn't it 1?  
! # now if I use the first 10 vgl output files:
! #The probe: 202189_x_at
! #$VAR1 = {
! #          'others_mean' => '14165.1471730769',
! #          'pvalue' => '1.16199434367298e-10',
! #          'mean' => '6678.66514285714',
! #          'counts' => 33
! #        };
! # where does 33 come from?  Also, is that an average p-value?  I need to debug where this structure is coming from!
! #print "The probe: 202189_x_at\n"; #202189_x_at
! #print Dumper $top_profiles->{$hc}{'202189_x_at'}; exit;
! # FIXME: the count problem was isolated to the parsing script and should be fixed
! 
! 
        my $sample_mean = $exp_values->{$file}{$probe};
        #print "$file $probe ".Dumper($exp_values->{$file}{$probe})."\n";
***************
*** 127,133 ****
        # FIXME: this actually performs worse so I think I need to keep track of which VG are actually lower expression
        # then the others
!       #if (defined($sample_mean) && abs($sample_mean/$score_mean) >= 2) {
!       if (defined($sample_mean) && (($sample_mean/$score_mean) >= 2 || ($sample_mean/$score_mean) <= 0.5 )) {
          #print " Yes this is positive for $hc\n";
          $class->{$subdir}{$profile_count_cutoff}{$file}{scores}{$hc}{raw_score} += 1;
        } else {
--- 154,162 ----
        # FIXME: this actually performs worse so I think I need to keep track of which VG are actually lower expression
        # then the others
!       if (defined($sample_mean) && abs($sample_mean/$score_mean) >= 2 && $top_profiles->{$hc}{$probe}{mean} > $top_profiles->{$hc}{$probe}{others_mean}) {
!       #if (defined($sample_mean) && (($sample_mean/$score_mean) >= 2 || ($sample_mean/$score_mean) <= 0.5 )) {
          #print " Yes this is positive for $hc\n";
+         $class->{$subdir}{$profile_count_cutoff}{$file}{scores}{$hc}{raw_score} += 1;
+       } elsif (defined($sample_mean) && abs($sample_mean/$score_mean) <= 0.5 && $top_profiles->{$hc}{$probe}{mean} < $top_profiles->{$hc}{$probe}{others_mean}) {
          $class->{$subdir}{$profile_count_cutoff}{$file}{scores}{$hc}{raw_score} += 1;
        } else {

Index: ReadVGLOutput.pm
===================================================================
RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/ReadVGLOutput.pm,v
retrieving revision 1.8
retrieving revision 1.9
diff -C2 -d -r1.8 -r1.9
*** ReadVGLOutput.pm	23 Feb 2006 22:11:58 -0000	1.8
--- ReadVGLOutput.pm	28 Feb 2006 07:36:42 -0000	1.9
***************
*** 9,12 ****
--- 9,15 ----
  # FIXME: doesn't work with multiple subdirs
  
+ # DEBUG
+ my $foo = {};
+ 
  =head2 process
  
***************
*** 123,128 ****
        foreach my $info (sort keys %{ $reference->{$hc}{$probe}}) {
          $results->{$hc}{$probe}{$info} += $reference->{$hc}{$probe}{$info};
!         $results->{$hc}{$probe}{counts}++;
        }
      }
    }
--- 126,133 ----
        foreach my $info (sort keys %{ $reference->{$hc}{$probe}}) {
          $results->{$hc}{$probe}{$info} += $reference->{$hc}{$probe}{$info};
!         # FIXME: is there a reason the counts might not be accurate here???!?!?!
!         #$results->{$hc}{$probe}{counts}++;
        }
+       $results->{$hc}{$probe}{counts}++;
      }
    }
***************
*** 160,164 ****
  }
  
! 
  sub _parse_vgl_output {
    my ($self, $file, $offset) = @_;
--- 165,170 ----
  }
  
! # FIXME: the program that makes the VGL output reverses the order of one of the output
! # columns, make sure this is taking into account when the following code is run
  sub _parse_vgl_output {
    my ($self, $file, $offset) = @_;
***************
*** 187,195 ****
        my $pvalue = 0;
        my $mean = 0;
        my $true_count = 0;
        if ($tokens[97-$offset] eq "TRUE") {
          $HC = $classification->{97};
          $pvalue = $tokens[96-$offset];
!         $mean = $tokens[93-$offset];
          $true_count++;
        } if ($tokens[104-$offset] eq "TRUE") {
--- 193,206 ----
        my $pvalue = 0;
        my $mean = 0;
+       my $others_mean = 0;
        my $true_count = 0;
        if ($tokens[97-$offset] eq "TRUE") {
          $HC = $classification->{97};
          $pvalue = $tokens[96-$offset];
!         # BUG: the column changes here, the first entry is reversed where the mean of 1A is first and
!         # the second column 1B_2A_2B mean
!         #$mean = $tokens[93-$offset];
!         $mean = $tokens[92-$offset];
!         $others_mean = $tokens[93-$offset];
          $true_count++;
        } if ($tokens[104-$offset] eq "TRUE") {
***************
*** 197,200 ****
--- 208,212 ----
          $pvalue = $tokens[103-$offset];
          $mean = $tokens[100-$offset];
+         $others_mean = $tokens[99-$offset];
          $true_count++;
        } if ($tokens[111-$offset] eq "TRUE") {
***************
*** 202,209 ****
--- 214,223 ----
          $pvalue = $tokens[110-$offset];
          $mean = $tokens[107-$offset];
+         $others_mean = $tokens[106-$offset];
          $true_count++;
        } if ($tokens[118-$offset] eq "TRUE") {
          $HC = $classification->{118};
          $pvalue = $tokens[117-$offset];
+         $others_mean = $tokens[116-$offset];
          $mean = $tokens[114-$offset];
          $true_count++;
***************
*** 219,223 ****
          #$result->{by_pvalue}{$HC}{$pvalue}{$tokens[0]} = 1;
          $result->{$HC}{$tokens[0]}{pvalue} = $pvalue;
!         $result->{$HC}{$tokens[0]}{mean}   = $mean
        }
      }
--- 233,243 ----
          #$result->{by_pvalue}{$HC}{$pvalue}{$tokens[0]} = 1;
          $result->{$HC}{$tokens[0]}{pvalue} = $pvalue;
!         $result->{$HC}{$tokens[0]}{mean}   = $mean;
!         $result->{$HC}{$tokens[0]}{others_mean} = $others_mean;
!         if ($mean > $others_mean) {
!           #print "Mean of $mean is greater than others mean of $others_mean for $tokens[0]\n";
!         } else {
!           #print "Mean of $mean is less than others mean of $others_mean for $tokens[0]\n";
!         }
        }
      }

[Nelsonlab-cmts] libnelson/java gecCel.java,1.2,1.3

From: <all...@su...> - 2006-02-27 07:58:11

Update of /cvsroot/libnelson/java
In directory sumo.genetics.ucla.edu:/tmp/cvs-serv24819

Modified Files:
	gecCel.java 
Log Message:
add run, user, experiment ids


Index: gecCel.java
===================================================================
RCS file: /cvsroot/libnelson/java/gecCel.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** gecCel.java	17 Feb 2006 01:45:32 -0000	1.2
--- gecCel.java	27 Feb 2006 07:58:07 -0000	1.3
***************
*** 17,22 ****
  public class gecCel {
    public static void main(String[] argv) {
!     String SEL_EXP_RUN = "SELECT sample.sample_id, chip.chip_id, chip.file_id || '.file' file_id FROM gen_experiment_chips sample, gen_chip_files chip, gen_file_types type WHERE type.file_type_id = chip.file_type_id AND sample.chip_id = chip.chip_id AND type.file_extension = 'cel'";
! 
  //    String SEL_ANNOT = "SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.cell_type_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND sample_id = ? UNION SELECT d.sample_id, 'null:' || c.name FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c WHERE d.dev_stage_level_id = c.cvterm_id AND sample_id = ? UNION SELECT d.sample_id, 'null:' || c.name FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c WHERE d.cell_growth_level_id = c.cvterm_id AND sample_id = ? UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_ds d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.disease_state_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND d.sample_id = ? UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_ed d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.experiment_design_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND d.sample_id = ? UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_pt d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.phenotype_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND d.sample_id = ? UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.rna_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND sample_id = ?";
  
--- 17,21 ----
  public class gecCel {
    public static void main(String[] argv) {
!     String SEL_EXP_RUN = "SELECT u.acs_user_id user_id, experiment.experiment_id, run.run_id, sample.sample_id, chip.chip_id, chip.file_id || '.file' file_id FROM sb_users u, gen_experiments experiment, gen_experiment_runs run, gen_experiment_chips sample, gen_chip_files chip, gen_file_types type WHERE run.run_id = sample.run_id AND type.file_type_id = chip.file_type_id AND sample.chip_id = chip.chip_id AND type.file_extension = 'cel' AND experiment.experiment_id = run.experiment_id AND experiment.survey_taker_id = u.user_id";
  //    String SEL_ANNOT = "SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.cell_type_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND sample_id = ? UNION SELECT d.sample_id, 'null:' || c.name FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c WHERE d.dev_stage_level_id = c.cvterm_id AND sample_id = ? UNION SELECT d.sample_id, 'null:' || c.name FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c WHERE d.cell_growth_level_id = c.cvterm_id AND sample_id = ? UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_ds d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.disease_state_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND d.sample_id = ? UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_ed d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.experiment_design_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref!
_id AND d.sample_id = ? UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_pt d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.phenotype_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND d.sample_id = ? UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.rna_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND sample_id = ?";
  
***************
*** 33,41 ****
        System.out.println("<files>");
        while(rs1.next()) {
!         int chip_id   = rs1.getInt("CHIP_ID");
!         String file_id   = rs1.getString("FILE_ID");
!         int sample_id = rs1.getInt("SAMPLE_ID");
  
!         System.out.println("  <file file_id=\"" + file_id + "\" chip_id=\"" + chip_id + "\" sample_id=\"" + sample_id + "\">"); 
  
          stmt2 = conn.prepareStatement(
--- 32,50 ----
        System.out.println("<files>");
        while(rs1.next()) {
!         int chip_id    = rs1.getInt("CHIP_ID");
!         int sample_id  = rs1.getInt("SAMPLE_ID");
!         int run_id     = rs1.getInt("RUN_ID");
!         int exp_id     = rs1.getInt("EXPERIMENT_ID");
!         int user_id    = rs1.getInt("USER_ID");
!         String file_id = rs1.getString("FILE_ID");
  
!         System.out.println("  <file run_id=\""        + run_id        + "\" " +
!                                    "file_id=\""       + file_id       + "\" " +
!                                    "chip_id=\""       + chip_id       + "\" " +
!                                    "sample_id=\""     + sample_id     + "\" " +
!                                    "user_id=\""       + user_id       + "\" " +
!                                    "run_id=\""        + run_id        + "\" " +
!                                    "experiment_id=\"" + exp_id        + "\" " +
!                              ">"); 
  
          stmt2 = conn.prepareStatement(

[Nelsonlab-cmts] dev-boconnor/project_logic_analysis/conf classification_with_profiles.xml,1.1,1.2 original_glioma_classification_with_vgl.xml,1.1,1.2 original_glioma_expanded_phenotypes_pipe.xml,1.7,1.8 original_glioma_vgl_parsing_pipe.xml,1.1,1.2 p53_breast_cancer_data_Miller_et_al_2005.xml,1.1,1.2 t-cell_leukemia_data_Soulier_et_al_2005.xml,1.1,1.2

From: <boc...@su...> - 2006-02-23 22:12:13

Update of /cvsroot/dev-boconnor/project_logic_analysis/conf
In directory sumo.genetics.ucla.edu:/tmp/cvs-serv7391/conf

Modified Files:
	classification_with_profiles.xml 
	original_glioma_classification_with_vgl.xml 
	original_glioma_expanded_phenotypes_pipe.xml 
	original_glioma_vgl_parsing_pipe.xml 
	p53_breast_cancer_data_Miller_et_al_2005.xml 
	t-cell_leukemia_data_Soulier_et_al_2005.xml 
Log Message:
Updates


Index: t-cell_leukemia_data_Soulier_et_al_2005.xml
===================================================================
RCS file: /cvsroot/dev-boconnor/project_logic_analysis/conf/t-cell_leukemia_data_Soulier_et_al_2005.xml,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** t-cell_leukemia_data_Soulier_et_al_2005.xml	22 Feb 2006 09:12:24 -0000	1.1
--- t-cell_leukemia_data_Soulier_et_al_2005.xml	23 Feb 2006 22:11:58 -0000	1.2
***************
*** 1,5 ****
  <!-- Variables that are used throughout -->
  [% datadir = "t-cell_leukemia_data_Soulier_et_al_2005" %]
! [% cutoff_for_stability_percent = 10 %]
  [% dirs = ['90']  %]
  [% dir_str = '90' %]
--- 1,5 ----
  <!-- Variables that are used throughout -->
  [% datadir = "t-cell_leukemia_data_Soulier_et_al_2005" %]
! [% cutoff_for_stability_percent = 6 %]
  [% dirs = ['90']  %]
  [% dir_str = '90' %]
***************
*** 71,75 ****
       <!-- FIXME: need to find all the profiles otherwise there won't be much to classify with -->
        [% index = index + 100 %] <!-- FIXME: scoping issues with this variable!! -->
!       <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::PickTopProfiles">
          <input>
            <item id="parsed_output_stashname" value="summary_of_ppla_output"/>
--- 71,75 ----
       <!-- FIXME: need to find all the profiles otherwise there won't be much to classify with -->
        [% index = index + 100 %] <!-- FIXME: scoping issues with this variable!! -->
!       <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::PickTopProfiles">
          <input>
            <item id="parsed_output_stashname" value="summary_of_ppla_output"/>
***************
*** 112,116 ****
        [% FOREACH dir = dirs %]
        <!-- FIXME: does this sort the profiles? I don't do that elsewhere. Maybe I should in PickTopProfiles -->
!       <step id="[% index %]" active="0" type="shell_command" processor="perl scripts/exportTriplets.pl">
          <processor_args>
            <arg id="1" name="" value="data/[% datadir %]/top_profiles/[% dir %]/top_[% cutoff_for_stability_percent %]_percent.profiles"/>
--- 112,116 ----
        [% FOREACH dir = dirs %]
        <!-- FIXME: does this sort the profiles? I don't do that elsewhere. Maybe I should in PickTopProfiles -->
!       <step id="[% index %]" active="1" type="shell_command" processor="perl scripts/exportTriplets.pl">
          <processor_args>
            <arg id="1" name="" value="data/[% datadir %]/top_profiles/[% dir %]/top_[% cutoff_for_stability_percent %]_percent.profiles"/>
***************
*** 124,128 ****
        </step>
        [% index = index + 1 %]
!       <step id="[% index %]" active="0" type="shell_command" processor="perl scripts/hypergeometric.pl">
          <processor_args>
            <arg id="1" name="" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt"/>
--- 124,128 ----
        </step>
        [% index = index + 1 %]
!       <step id="[% index %]" active="1" type="shell_command" processor="perl scripts/hypergeometric.pl">
          <processor_args>
            <arg id="1" name="" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt"/>
***************
*** 140,144 ****
             visualiseTriplet.pl output.-->
        [% index = index + 1 %]
!       <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::AppendPValuesToExportOutput">
          <input>
            <item id="profile_output" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt"/>
--- 140,144 ----
             visualiseTriplet.pl output.-->
        [% index = index + 1 %]
!       <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::AppendPValuesToExportOutput">
          <input>
            <item id="profile_output" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt"/>
***************
*** 158,162 ****
        <!-- FIXME: this is redundant with what's below! -->
        [% index = index + 1 %]
!       <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAOutputProfiles">
          <input>
            <item id="ppla_output_profiles_dir" value="data/[% datadir %]/top_profiles"/>
--- 158,162 ----
        <!-- FIXME: this is redundant with what's below! -->
        [% index = index + 1 %]
!       <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAOutputProfiles">
          <input>
            <item id="ppla_output_profiles_dir" value="data/[% datadir %]/top_profiles"/>

Index: classification_with_profiles.xml
===================================================================
RCS file: /cvsroot/dev-boconnor/project_logic_analysis/conf/classification_with_profiles.xml,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** classification_with_profiles.xml	22 Feb 2006 09:12:23 -0000	1.1
--- classification_with_profiles.xml	23 Feb 2006 22:11:58 -0000	1.2
***************
*** 9,13 ****
        <!-- FIXME: all PPLA input files must contain /^sample/ on the first row -->
        [% index = index + 1 %]
!       <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAInputFile">
          <input>
            <item id="ppla_input_file" value="data/[% datadir %]/profiles/100/file_list_0.profile"/>
--- 9,13 ----
        <!-- FIXME: all PPLA input files must contain /^sample/ on the first row -->
        [% index = index + 1 %]
!       <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAInputFile">
          <input>
            <item id="ppla_input_file" value="data/[% datadir %]/profiles/100/file_list_0.profile"/>
***************
*** 48,52 ****
        [% index = index + 1 %]
        <!-- FIXME: this needs to loop and only use the top 10,20,30,40...100 top profiles for each HC -->
!       <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::ScoreSamplesViaProfiles">
          <input>
            <!-- FIXME: these phenotypes are hard coded, may need to change with different datasets -->
--- 48,52 ----
        [% index = index + 1 %]
        <!-- FIXME: this needs to loop and only use the top 10,20,30,40...100 top profiles for each HC -->
!       <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::ScoreSamplesViaProfiles">
          <input>
            <!-- FIXME: these phenotypes are hard coded, may need to change with different datasets -->
***************
*** 66,70 ****
        </step>
        [% index = index + 1 %]
!       <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::SummarizeClassification">
          <input>
            <item id="stash_input" value="scores_for_samples"/>
--- 66,70 ----
        </step>
        [% index = index + 1 %]
!       <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::SummarizeClassification">
          <input>
            <item id="stash_input" value="scores_for_samples"/>
***************
*** 80,84 ****
        </step>
        [% index = index + 1 %]
!       <step id="[% index %]" active="0" type="shell_command" processor="R">
          <processor_args>
            <arg id="1" name="" value="--vanilla"/>
--- 80,84 ----
        </step>
        [% index = index + 1 %]
!       <step id="[% index %]" active="1" type="shell_command" processor="R">
          <processor_args>
            <arg id="1" name="" value="--vanilla"/>

Index: original_glioma_expanded_phenotypes_pipe.xml
===================================================================
RCS file: /cvsroot/dev-boconnor/project_logic_analysis/conf/original_glioma_expanded_phenotypes_pipe.xml,v
retrieving revision 1.7
retrieving revision 1.8
diff -C2 -d -r1.7 -r1.8
*** original_glioma_expanded_phenotypes_pipe.xml	16 Feb 2006 08:19:59 -0000	1.7
--- original_glioma_expanded_phenotypes_pipe.xml	23 Feb 2006 22:11:58 -0000	1.8
***************
*** 55,59 ****
  
        <!-- Code that parses the Voting Gene List output -->
!       [%# INCLUDE original_glioma_vgl_parsing_pipe.xml %]
  
        <!-- 
--- 55,59 ----
  
        <!-- Code that parses the Voting Gene List output -->
!       [% INCLUDE original_glioma_vgl_parsing_pipe.xml %]
  
        <!-- 
***************
*** 237,241 ****
          (maybe I can work with Barry on a statistical technique that doesn't require random sampling)
         -->
!       [% INCLUDE search_for_tf_binding_sites.xml %]
  
              
--- 237,241 ----
          (maybe I can work with Barry on a statistical technique that doesn't require random sampling)
         -->
!       [%# INCLUDE search_for_tf_binding_sites.xml %]
  
              
***************
*** 248,252 ****
  
        <!-- Perform the classification based on the vgl from Marc -->
!       [%# INCLUDE original_glioma_classification_with_vgl.xml %]
  
        <!-- Collect some statistics on stability and U score -->
--- 248,252 ----
  
        <!-- Perform the classification based on the vgl from Marc -->
!       [% INCLUDE original_glioma_classification_with_vgl.xml %]
  
        <!-- Collect some statistics on stability and U score -->

Index: original_glioma_classification_with_vgl.xml
===================================================================
RCS file: /cvsroot/dev-boconnor/project_logic_analysis/conf/original_glioma_classification_with_vgl.xml,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** original_glioma_classification_with_vgl.xml	8 Nov 2005 02:13:47 -0000	1.1
--- original_glioma_classification_with_vgl.xml	23 Feb 2006 22:11:58 -0000	1.2
***************
*** 5,9 ****
        [% i = 1 %]
        [% FOREACH dir = dirs %]
!       <step id="48" active="0" type="module" processor="Nelson::Pipe::Container::Job::ScoreSamplesViaVGL">
          <input>
            <item id="list_of_files" value="data/[% datadir %]/111_glioma_classification/to_classify_file_list.txt"/>
--- 5,9 ----
        [% i = 1 %]
        [% FOREACH dir = dirs %]
!       <step id="48" active="1" type="module" processor="Nelson::Pipe::Container::Job::ScoreSamplesViaVGL">
          <input>
            <item id="list_of_files" value="data/[% datadir %]/111_glioma_classification/to_classify_file_list.txt"/>
***************
*** 20,24 ****
          </output>
        </step>
!       <step id="48.2" active="0" type="module" processor="Nelson::Pipe::Container::Job::SummarizeClassification">
          <input>
            <item id="stash_input" value="vgl_scores_for_samples"/>
--- 20,24 ----
          </output>
        </step>
!       <step id="48.2" active="1" type="module" processor="Nelson::Pipe::Container::Job::SummarizeClassification">
          <input>
            <item id="stash_input" value="vgl_scores_for_samples"/>

Index: p53_breast_cancer_data_Miller_et_al_2005.xml
===================================================================
RCS file: /cvsroot/dev-boconnor/project_logic_analysis/conf/p53_breast_cancer_data_Miller_et_al_2005.xml,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** p53_breast_cancer_data_Miller_et_al_2005.xml	22 Feb 2006 09:12:24 -0000	1.1
--- p53_breast_cancer_data_Miller_et_al_2005.xml	23 Feb 2006 22:11:58 -0000	1.2
***************
*** 1,5 ****
  <!-- Variables that are used throughout -->
  [% datadir = "p53_breast_cancer_data_Miller_et_al_2005" %]
! [% cutoff_for_stability_percent = 10 %]
  [% dirs = ['75']  %]
  [% dir_str = '75' %]
--- 1,5 ----
  <!-- Variables that are used throughout -->
  [% datadir = "p53_breast_cancer_data_Miller_et_al_2005" %]
! [% cutoff_for_stability_percent = 6 %]
  [% dirs = ['75']  %]
  [% dir_str = '75' %]
***************
*** 74,78 ****
       <!-- FIXME: need to find all the profiles otherwise there won't be much to classify with -->
        [% index = index + 29 %] <!-- FIXME: scoping issues with this variable!! -->
!       <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::PickTopProfiles">
          <input>
            <item id="parsed_output_stashname" value="summary_of_ppla_output"/>
--- 74,78 ----
       <!-- FIXME: need to find all the profiles otherwise there won't be much to classify with -->
        [% index = index + 29 %] <!-- FIXME: scoping issues with this variable!! -->
!       <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::PickTopProfiles">
          <input>
            <item id="parsed_output_stashname" value="summary_of_ppla_output"/>
***************
*** 115,119 ****
        [% FOREACH dir = dirs %]
        <!-- FIXME: does this sort the profiles? I don't do that elsewhere. Maybe I should in PickTopProfiles -->
!       <step id="[% index %]" active="0" type="shell_command" processor="perl scripts/exportTriplets.pl">
          <processor_args>
            <arg id="1" name="" value="data/[% datadir %]/top_profiles/[% dir %]/top_[% cutoff_for_stability_percent %]_percent.profiles"/>
--- 115,119 ----
        [% FOREACH dir = dirs %]
        <!-- FIXME: does this sort the profiles? I don't do that elsewhere. Maybe I should in PickTopProfiles -->
!       <step id="[% index %]" active="1" type="shell_command" processor="perl scripts/exportTriplets.pl">
          <processor_args>
            <arg id="1" name="" value="data/[% datadir %]/top_profiles/[% dir %]/top_[% cutoff_for_stability_percent %]_percent.profiles"/>
***************
*** 127,131 ****
        </step>
        [% index = index + 1 %]
!       <step id="[% index %]" active="0" type="shell_command" processor="perl scripts/hypergeometric.pl">
          <processor_args>
            <arg id="1" name="" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt"/>
--- 127,131 ----
        </step>
        [% index = index + 1 %]
!       <step id="[% index %]" active="1" type="shell_command" processor="perl scripts/hypergeometric.pl">
          <processor_args>
            <arg id="1" name="" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt"/>
***************
*** 143,147 ****
             visualiseTriplet.pl output.-->
        [% index = index + 1 %]
!       <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::AppendPValuesToExportOutput">
          <input>
            <item id="profile_output" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt"/>
--- 143,147 ----
             visualiseTriplet.pl output.-->
        [% index = index + 1 %]
!       <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::AppendPValuesToExportOutput">
          <input>
            <item id="profile_output" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt"/>
***************
*** 161,165 ****
        <!-- FIXME: this is redundant with what's below! -->
        [% index = index + 1 %]
!       <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAOutputProfiles">
          <input>
            <item id="ppla_output_profiles_dir" value="data/[% datadir %]/top_profiles"/>
--- 161,165 ----
        <!-- FIXME: this is redundant with what's below! -->
        [% index = index + 1 %]
!       <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAOutputProfiles">
          <input>
            <item id="ppla_output_profiles_dir" value="data/[% datadir %]/top_profiles"/>

Index: original_glioma_vgl_parsing_pipe.xml
===================================================================
RCS file: /cvsroot/dev-boconnor/project_logic_analysis/conf/original_glioma_vgl_parsing_pipe.xml,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** original_glioma_vgl_parsing_pipe.xml	8 Nov 2005 02:13:48 -0000	1.1
--- original_glioma_vgl_parsing_pipe.xml	23 Feb 2006 22:11:58 -0000	1.2
***************
*** 13,16 ****
--- 13,18 ----
            <item id="col_offset" value="19"/> <!-- was 12 before, was that wrong? -->
            <item id="reference_profile" value="data/[% datadir %]/vgl/100/All_DChip_expression_vals_longNames_groupTtest.xls"/>
+           <item id="parser_type" value="4_columns"/>
+           <item id="input_file_name" value="All_DChip_expression_vals_longNames_groupTtest.xls"/>
          </input>
          <output>
***************
*** 25,28 ****
--- 27,31 ----
            <item id="col_offset" value="32"/>
            <item id="reference_profile" value="data/[% datadir %]/vgl/100/All_DChip_expression_vals_longNames_groupTtest.xls"/>
+           <item id="parser_type" value="2_phenotypes"/>
          </input>
          <output>
***************
*** 37,40 ****
--- 40,44 ----
            <item id="col_offset" value="52"/>
            <item id="reference_profile" value="data/[% datadir %]/vgl/100/All_DChip_expression_vals_longNames_groupTtest.xls"/>
+           <item id="parser_type" value="2_phenotypes"/>
          </input>
          <output>

[Nelsonlab-cmts] dev-boconnor/project_logic_analysis/data classification_scatterplot.sxc,1.1,1.2

From: <boc...@su...> - 2006-02-23 22:12:11

Update of /cvsroot/dev-boconnor/project_logic_analysis/data
In directory sumo.genetics.ucla.edu:/tmp/cvs-serv7391/data

Modified Files:
	classification_scatterplot.sxc 
Log Message:
Updates


Index: classification_scatterplot.sxc
===================================================================
RCS file: /cvsroot/dev-boconnor/project_logic_analysis/data/classification_scatterplot.sxc,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
Binary files /tmp/cvsWl4QLN and /tmp/cvsUrYDrl differ

[Nelsonlab-cmts] dev-boconnor/project_logic_analysis/scripts pull_out_top_profiles.pl,NONE,1.1

From: <boc...@su...> - 2006-02-23 22:12:11

Update of /cvsroot/dev-boconnor/project_logic_analysis/scripts
In directory sumo.genetics.ucla.edu:/tmp/cvs-serv7391/scripts

Added Files:
	pull_out_top_profiles.pl 
Log Message:
Updates


--- NEW FILE: pull_out_top_profiles.pl ---
use strict;
use warnings;
use Data::Dumper;

# Usage: pull_out_top_profiles.pl COUNT < tab_delimited_input
#
# Reads tab-delimited lines from STDIN and groups the first column of each
# line under the value found in the eighth column (index 7), referred to
# below as the "stability" value.  It then prints the COUNT numerically
# largest stability values, each followed by a Data::Dumper dump of the
# first-column entries recorded for it.

my $count = shift;
my $data = {};

while(<STDIN>) {
  chomp;
  my @tokens = split /\t/;
  # Hash-of-hashes: stability value => { first-column entry => 1 }.
  $data->{$tokens[7]}{$tokens[0]} = 1;
}

# Ascending numeric order, so that pop() below yields the largest value first.
my @stability = sort {$a <=> $b } keys %{$data};

my $i = 0;
# Stop at COUNT entries or when the distinct stability values run out,
# whichever comes first; popping an empty list would otherwise print an
# empty "Stability = " line and an undef dump for every surplus iteration.
while ($i < $count && @stability) {
  my $stab = pop @stability;
  print "Stability = $stab\n";
  print Dumper $data->{$stab};
  $i++;
}

[Nelsonlab-cmts] dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job ReadVGLOutput.pm,1.7,1.8 ScoreSamplesViaVGL.pm,1.5,1.6

From: <boc...@su...> - 2006-02-23 22:12:11

Update of /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job
In directory sumo.genetics.ucla.edu:/tmp/cvs-serv7391/lib/perl/Nelson/Pipe/Container/Job

Modified Files:
	ReadVGLOutput.pm ScoreSamplesViaVGL.pm 
Log Message:
Updates


Index: ScoreSamplesViaVGL.pm
===================================================================
RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/ScoreSamplesViaVGL.pm,v
retrieving revision 1.5
retrieving revision 1.6
diff -C2 -d -r1.5 -r1.6
*** ScoreSamplesViaVGL.pm	22 Feb 2006 09:14:59 -0000	1.5
--- ScoreSamplesViaVGL.pm	23 Feb 2006 22:11:59 -0000	1.6
***************
*** 67,71 ****
     # now loop through each profile cutoff and score each file
     foreach my $profile_count_cutoff (@profile_count_cutoffs) {
!      open SUMMARY, ">".$self->{output_summary_file}."_profile_count_$profile_count_cutoff.txt" or die;
       if ($profile_count_cutoff ne 'all') {
         # read the profiles to test with here
--- 67,71 ----
     # now loop through each profile cutoff and score each file
     foreach my $profile_count_cutoff (@profile_count_cutoffs) {
!      open SUMMARY, ">".$self->{output_summary_file}."_profile_count_$profile_count_cutoff.txt" or die "Can't open ".$self->{output_summary_file}."_profile_count_$profile_count_cutoff.txt for writing\n";
       if ($profile_count_cutoff ne 'all') {
         # read the profiles to test with here
***************
*** 124,128 ****
        if (!defined ($sample_mean )) { die "Sample Mean not defined for $file $probe\n"; }
        
!       if (defined($sample_mean) && abs($sample_mean/$score_mean) >= 2) {
          #print " Yes this is positive for $hc\n";
          $class->{$subdir}{$profile_count_cutoff}{$file}{scores}{$hc}{raw_score} += 1;
--- 124,132 ----
        if (!defined ($sample_mean )) { die "Sample Mean not defined for $file $probe\n"; }
        
!       # BUG: I think the next line of code was incorrect!!
!       # FIXME: this actually performs worse so I think I need to keep track of which VG are actually lower expression
!       # then the others
!       #if (defined($sample_mean) && abs($sample_mean/$score_mean) >= 2) {
!       if (defined($sample_mean) && (($sample_mean/$score_mean) >= 2 || ($sample_mean/$score_mean) <= 0.5 )) {
          #print " Yes this is positive for $hc\n";
          $class->{$subdir}{$profile_count_cutoff}{$file}{scores}{$hc}{raw_score} += 1;

Index: ReadVGLOutput.pm
===================================================================
RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/ReadVGLOutput.pm,v
retrieving revision 1.7
retrieving revision 1.8
diff -C2 -d -r1.7 -r1.8
*** ReadVGLOutput.pm	22 Feb 2006 09:14:59 -0000	1.7
--- ReadVGLOutput.pm	23 Feb 2006 22:11:58 -0000	1.8
***************
*** 61,65 ****
      $results->{frequency}{$subdir} = {};
      $results->{comparison_to_reference}{$subdir} = {};
- 
      # FIXME
      my @files = glob("$profile_output_dir/$subdir/file_list_*.txt_results");
--- 61,64 ----
***************
*** 68,72 ****
        my $curr_output;
        if ($parser_type eq '2_phenotypes') { $curr_output = $self->_parse_2_pheno_vgl_output("$file/$input_file_name", $col_offset, $base_col, $pheno_str_1, $pheno_str_2); }
!       else { $curr_output = $self->_parse_vgl_output("$file/$input_file_name", $col_offset); }
        # this is used to store a count/lines for all profiles encountered in this particular subdir
        $self->_add_to_all_profiles($curr_output, $results->{all_profiles}{$subdir});
--- 67,75 ----
        my $curr_output;
        if ($parser_type eq '2_phenotypes') { $curr_output = $self->_parse_2_pheno_vgl_output("$file/$input_file_name", $col_offset, $base_col, $pheno_str_1, $pheno_str_2); }
!       else { 
! 	$curr_output = $self->_parse_vgl_output("$file/$input_file_name", $col_offset); 
!         #print "HERE!!!!".Dumper $curr_output;
!         #die;
!       }
        # this is used to store a count/lines for all profiles encountered in this particular subdir
        $self->_add_to_all_profiles($curr_output, $results->{all_profiles}{$subdir});

[Nelsonlab-cmts] dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job CreateProfiles_2.pm,NONE,1.1 CreateProfiles.pm,1.15,1.16 PPLARunner.pm,1.7,1.8 PickTopProfiles.pm,1.5,1.6 ReadProfileOutput.pm,1.8,1.9 ReadVGLOutput.pm,1.6,1.7 ScoreSamplesViaProfiles.pm,1.5,1.6 ScoreSamplesViaVGL.pm,1.4,1.5 SifFileParser.pm,1.9,1.10 SummarizeClassification.pm,1.5,1.6

From: <boc...@su...> - 2006-02-22 09:15:37

Update of /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job
In directory sumo.genetics.ucla.edu:/tmp/cvs-serv8351/lib/perl/Nelson/Pipe/Container/Job

Modified Files:
	CreateProfiles.pm PPLARunner.pm PickTopProfiles.pm 
	ReadProfileOutput.pm ReadVGLOutput.pm 
	ScoreSamplesViaProfiles.pm ScoreSamplesViaVGL.pm 
	SifFileParser.pm SummarizeClassification.pm 
Added Files:
	CreateProfiles_2.pm 
Log Message:
Many updates to the existing logic analysis libs and also a lot of new additions, particularly scripts


Index: ScoreSamplesViaProfiles.pm
===================================================================
RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/ScoreSamplesViaProfiles.pm,v
retrieving revision 1.5
retrieving revision 1.6
diff -C2 -d -r1.5 -r1.6
*** ScoreSamplesViaProfiles.pm	27 Sep 2005 22:54:38 -0000	1.5
--- ScoreSamplesViaProfiles.pm	22 Feb 2006 09:14:59 -0000	1.6
***************
*** 28,32 ****
     # for each file in the test set
     my @files;
!    open LIST, $list_of_files or die;
     while(<LIST>) {
       chomp;
--- 28,32 ----
     # for each file in the test set
     my @files;
!    open LIST, $list_of_files or die "Can't open $list_of_files\n";
     while(<LIST>) {
       chomp;

Index: PickTopProfiles.pm
===================================================================
RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/PickTopProfiles.pm,v
retrieving revision 1.5
retrieving revision 1.6
diff -C2 -d -r1.5 -r1.6
*** PickTopProfiles.pm	19 Aug 2005 23:17:09 -0000	1.5
--- PickTopProfiles.pm	22 Feb 2006 09:14:58 -0000	1.6
***************
*** 31,35 ****
    my @subdirs = split /,/, $self->{profile_sub_dirs};
  
!   my @original_profiles = @{$self->_read_profiles($self->{complete_ppla_output})};
  
    foreach my $subdir (@subdirs) {
--- 31,35 ----
    my @subdirs = split /,/, $self->{profile_sub_dirs};
  
!   #my @original_profiles = @{$self->_read_profiles($self->{complete_ppla_output})};
  
    foreach my $subdir (@subdirs) {
***************
*** 42,46 ****
        foreach my $b (keys %{$data->{frequency}{$subdir}{tally}{$c}}) {
          foreach my $a (keys %{$data->{frequency}{$subdir}{tally}{$c}{$b}}) {
!           #print "DEBUG: $c $b $a $count_min ".$data->{frequency}{$subdir}{tally}{$c}{$b}{$a}."\n";
            if ($data->{frequency}{$subdir}{tally}{$c}{$b}{$a} >= $count_min) {
              ##my $cache = $data->{all_profiles}; # done for memory issues
--- 42,46 ----
        foreach my $b (keys %{$data->{frequency}{$subdir}{tally}{$c}}) {
          foreach my $a (keys %{$data->{frequency}{$subdir}{tally}{$c}{$b}}) {
!           print "DEBUG: $c $b $a $count_min ".$data->{frequency}{$subdir}{tally}{$c}{$b}{$a}."\n";
            if ($data->{frequency}{$subdir}{tally}{$c}{$b}{$a} >= $count_min) {
              ##my $cache = $data->{all_profiles}; # done for memory issues
***************
*** 79,83 ****
    my ($self, $file) = @_;
    my @results;
!   open PROFILES, $file or die;
    while(<PROFILES>) {
      chomp;
--- 79,83 ----
    my ($self, $file) = @_;
    my @results;
!   open PROFILES, $file or die "Can't open file $file\n";
    while(<PROFILES>) {
      chomp;

--- NEW FILE: CreateProfiles_2.pm ---
package Nelson::Pipe::Container::Job::CreateProfiles_2;

use base qw(Nelson::Pipe::Container::Job);
use strict;
use Data::Dumper;
use Storable;

=head2 process

 Title   : process
 Usage   : $job->process($input, $output, $stash);
 Function: For every comma separated entry of file_list_sub_dir, creates
           profile_dir/<sub_dir> and hands each *.txt file list found in
           file_list_dir/<sub_dir> to _process_cel_files, provided the
           number in its trailing "_<number>.txt" satisfies
           start <= number < end. When pre_cache_mas5 is '1' the mas5
           results are cached first via _pre_cache_mas5 and, if
           mas5_cache_output is defined, stored there afterwards.
 Example :
 Returns : no explicit return value
 Args    : $input  - hashref; the {value} of file_list_dir,
                     file_list_sub_dir, mas5_dir, sif_file_hash, start,
                     end and pre_cache_mas5 is read
           $output - hashref; the {value} of profile_dir and
                     mas5_cache_output is read
           $stash  - hashref; the sif hash is looked up under the name
                     given by sif_file_hash

=cut

sub process {
  my ($self, $input, $output, $stash) = @_;

  my $file_list_dir      = $input->{file_list_dir}{value};
  my $file_list_sub_dir  = $input->{file_list_sub_dir}{value};
  my $mas5_dir           = $input->{mas5_dir}{value};
  my $profile_dir        = $output->{profile_dir}{value};
  my $sif_file_hash_name = $input->{sif_file_hash}{value};
  my $sif_file_hash      = $stash->{$sif_file_hash_name};
  my $start              = $input->{start}{value};
  my $end                = $input->{end}{value};
  my $pre_cache_mas5     = $input->{pre_cache_mas5}{value};

  my $mas5_cache_output  = $output->{mas5_cache_output}{value};

  # try to load the sif_file_hash from its Storable file if the stash has none
  if (!defined($sif_file_hash)) {
    $sif_file_hash = retrieve($self->{sif_file_hash_storable});
  }

  # pre-cache the mas5 results
  $self->{mas5_cache} = {};
  if ($pre_cache_mas5 eq '1') {
    $self->{mas5_cache} = $self->_pre_cache_mas5($mas5_dir);
  }

  foreach my $sub_dir (split /,/, $file_list_sub_dir) {
    # list form of system: the path is passed to mkdir as-is, never to a shell
    system('mkdir', '-p', "$profile_dir/$sub_dir") == 0
      or die "Can't create directory $profile_dir/$sub_dir\n";

    foreach my $list_of_cel_files (glob("$file_list_dir/$sub_dir/*.txt")) {
      # only numbered lists are processed; $1 must not be read after a
      # failed match
      next unless $list_of_cel_files =~ /_(\d+)\.txt$/;
      my $file_num = $1;
      if ($file_num >= $start && $file_num < $end) {
        $self->_process_cel_files($list_of_cel_files, $mas5_dir, $profile_dir, $sub_dir, $sif_file_hash);
      }
    }
  }

  if ($pre_cache_mas5 eq '1' and defined($mas5_cache_output)) {
    store $self->{mas5_cache}, $mas5_cache_output;
  }
}

# Reads every *.txt file in $mas5_dir and returns a hashref with
#   filenames => [ basename (without .txt) of each file, in glob order ]
#   probes    => { probe id => [ one 0/1 call per file, in the same order ] }
# A call is 1 when the p-value column is <= 0.05, otherwise 0. With
# $self->{parse_old_mas5} set, the p-value is column 5 and lines with fewer
# than five columns or starting with "Probe" are skipped; otherwise the
# p-value is column 1.
sub _pre_cache_mas5 {
  my ($self, $mas5_dir) = @_;

  my $cache = {};

  foreach my $file (glob("$mas5_dir/*.txt")) {
    # three-arg open on a lexical handle: no mode injection through the
    # file name and no handle shared with other subs
    open my $mas5_fh, '<', $file or die "can't open $file: $!";

    my ($filename) = $file =~ m{/([^/]+)\.txt$};
    push @{$cache->{filenames}}, $filename;

    while (my $line = <$mas5_fh>) {
      my @tokens = split /\t/, $line;

      my $pvalue;
      if ($self->{parse_old_mas5} && scalar(@tokens) > 4 && $tokens[0] !~ /^Probe/) {
        $pvalue = $tokens[5];
      } elsif ($self->{parse_old_mas5} == 0) {
        $pvalue = $tokens[1];
      } else {
        next; # header or short line of an old style file
      }

      push @{$cache->{probes}{$tokens[0]}}, ($pvalue <= 0.05 ? 1 : 0);
    }
    close $mas5_fh;
  }

  return($cache);
}

# Builds one profile from a single file list.
# $list_of_cel_files must be named file_list_<number>.txt and holds one
# sample file name per line. For each name the annotations and the mas5
# calls are collected into one results structure, which is then written to
# $profile_dir/$sub_dir/file_list_<number>.profile.
sub _process_cel_files {
  my ($self, $list_of_cel_files, $mas5_dir, $profile_dir, $sub_dir, $sif_hash) = @_;
  my $results = {};

  # without a match $1 would be undefined and every such list would be
  # written to the same "file_list_.profile"
  $list_of_cel_files =~ /file_list_(\d+)\.txt/
    or die "Can't find a file_list_<number>.txt name in $list_of_cel_files\n";
  my $file_num = $1;

  open my $list_fh, '<', $list_of_cel_files
    or die "Can't open file list $list_of_cel_files: $!\n";
  while (my $old_filename = <$list_fh>) {
    chomp $old_filename;

    # spaces and '#' are replaced in the name used for the profile; the
    # untouched name is still needed for the sif and mas5 lookups
    (my $filename = $old_filename) =~ s/[ #]/_/g;
    push @{$results->{samples}{names}}, $filename;

    print "Getting Annotations for $filename\n";
    $self->_get_annotations($filename, $old_filename, $results, $sif_hash);
    print "Parsing File $filename\n";
    $self->_parse_file($old_filename, $results, $mas5_dir);
  }
  close $list_fh;

  # at this point all the P/A calls are loaded for all the files
  # in the list, next print it out
  print "Printing profile\n";
  my $outfile = "$profile_dir/$sub_dir/file_list_$file_num.profile";
  $self->_print_profile($results, $outfile);
}

sub _get_annotations {
  my ($self, $filename, $old_filename, $results, $sif_hash) = @_;

  # One rule per phenotype block:
  #   [ sif field, [ [ pattern, flagged annotation ], ... ], [ annotations written ] ]
  # The patterns are tried in order against the sif field of $old_filename.
  # On the first match every listed annotation is written for $filename:
  # 1 for the flagged one, 0 for the others. Without a match nothing is
  # written for that rule.
  my @rules = (
    [ 'hc',
      [ [ qr/1A/, 'HC_1A' ], [ qr/1B/, 'HC_1B' ], [ qr/2A/, 'HC_2A' ], [ qr/2B/, 'HC_2B' ] ],
      [ qw(HC_1A HC_1B HC_2A HC_2B) ] ],
    [ 'tumor_type',
      [ [ qr/mixed/, 'tumor_type_mixed' ], [ qr/gbm/, 'tumor_type_gbm' ],
        [ qr/oligo/, 'tumor_type_oligo' ], [ qr/astro/, 'tumor_type_astro' ] ],
      [ qw(tumor_type_mixed tumor_type_gbm tumor_type_oligo tumor_type_astro) ] ],
    [ 'sex',
      [ [ qr/F/, 'sex_f' ], [ qr/M/, 'sex_m' ] ],
      [ qw(sex_f sex_m) ] ],
    [ 'grade',
      [ [ qr/3/, 'grade_3' ], [ qr/4/, 'grade_4' ] ],
      [ qw(grade_3 grade_4) ] ],
    [ 'survival_cluster',
      [ [ qr/SC1/, 'survival_cluster_1' ], [ qr/SC2/, 'survival_cluster_2' ] ],
      [ qw(survival_cluster_1 survival_cluster_2) ] ],

    # These were added for the p53 Breast Cancer Dataset Miller et al 2005
    # grade (grade 4 doesn't exist in this dataset!!); this rule runs after
    # the plain grade rule above and rewrites grade_3/grade_4
    [ 'grade',
      [ [ qr/G1/, 'grade_1' ], [ qr/G2/, 'grade_2' ], [ qr/G3/, 'grade_3' ] ],
      [ qw(grade_1 grade_2 grade_3 grade_4) ] ],
    [ 'lymph_pos',
      [ [ qr/0/, 'lymph_neg' ], [ qr/1/, 'lymph_pos' ] ],
      [ qw(lymph_pos lymph_neg) ] ],
    [ 'er_wt',
      [ [ qr/1/, 'er_wt' ], [ qr/0/, 'er_mt' ] ],
      [ qw(er_wt er_mt) ] ],
    [ 'pgr_wt',
      [ [ qr/1/, 'pgr_wt' ], [ qr/0/, 'pgr_mt' ] ],
      [ qw(pgr_wt pgr_mt) ] ],
    [ 'dlda_mt',
      [ [ qr/1/, 'dlda_mt' ], [ qr/0/, 'dlda_wt' ] ],
      [ qw(dlda_wt dlda_mt) ] ],
    [ 'p53_wt',
      [ [ qr/1/, 'p53_wt' ], [ qr/0/, 'p53_mt' ] ],
      [ qw(p53_wt p53_mt) ] ],
  );

  foreach my $rule (@rules) {
    my ($field, $choices, $annotations) = @{$rule};
    foreach my $choice (@{$choices}) {
      my ($pattern, $flagged) = @{$choice};
      next unless $sif_hash->{'files'}{$old_filename}{$field} =~ $pattern;
      foreach my $annotation (@{$annotations}) {
        $results->{samples}{annotations}{$annotation}{$filename} = ($annotation eq $flagged) ? 1 : 0;
      }
      last;
    }
  }

  # survival time: one annotation per distinct survival time, numbered in
  # ascending order; set when this sample's survival time reaches that value
  my $group = 0;
  foreach my $threshold (sort {$a <=> $b} keys %{$sif_hash->{'files_by_survival_time'}}) {
    $group++;
    $results->{samples}{annotations}{"survial_time_group_$group"}{$filename} =
      ($sif_hash->{'files'}{$old_filename}{survival_time} >= $threshold) ? 1 : 0;
  }
}

# Writes the collected results to $outfile as a tab separated table:
#   - a "samples" header row with the sample names,
#   - one row per annotation (sorted by name) with one value per sample,
#   - one row per probe (sorted by id) with one call per sample.
# Annotation rows always get one column per sample name, so a sample that
# has no value for an annotation leaves an empty cell instead of shifting
# the remaining values to the left.
sub _print_profile {
  my ($self, $results, $outfile) = @_;
  open my $out_fh, '>', $outfile or die "Can't open outfile: $outfile: $!\n";

  my @names = @{$results->{samples}{names}};
  print {$out_fh} "samples\t".join("\t", @names),"\n";

  foreach my $anno (sort keys %{$results->{samples}{annotations}}) {
    my $value_of = $results->{samples}{annotations}{$anno};
    my @cells = map { defined($value_of->{$_}) ? $value_of->{$_} : '' } @names;
    print {$out_fh} "$anno\t".join("\t", @cells),"\n";
  }

  foreach my $probe (sort keys %{$results->{probes}}) {
    # every call is followed by a tab, including the last one; this row
    # format is left exactly as it was
    print {$out_fh} "$probe\t";
    foreach my $value (@{$results->{probes}{$probe}}) {
      print {$out_fh} "$value\t";
    }
    print {$out_fh} "\n";
  }

  # buffered write errors only show up when the handle is closed
  close $out_fh or die "Can't close outfile: $outfile: $!\n";
}

# Appends the 0/1 call of every probe for sample $file to
# @{ $results->{probes}{<probe id>} }.
# When $self->{mas5_cache}{probes} is defined the calls are copied from the
# cache, using the position of $file in $self->{mas5_cache}{filenames};
# a file missing from the cache is a fatal error, because falling back to
# position 0 would copy another sample's calls. Otherwise
# $mas5_dir/$file.txt is read directly with the same rules as the cache
# builder: call is 1 when the p-value column is <= 0.05, and the columns
# depend on $self->{parse_old_mas5}.
sub _parse_file {
  my ($self, $file, $results, $mas5_dir) = @_;

  if (defined($self->{mas5_cache}{probes})) { # then the cache is used
    # find the offset for this file
    my $index;
    my $curr_index = 0;
    foreach my $filename (@{$self->{mas5_cache}{filenames}}) {
      if ($filename eq $file) { $index = $curr_index; last; }
      $curr_index++;
    }
    die "mas5 cache has no entry for file $file\n" unless defined($index);

    # now iterate over and copy calls to results structure
    foreach my $probe (keys %{$self->{mas5_cache}{probes}}) {
      push @{$results->{probes}{$probe}}, $self->{mas5_cache}{probes}{$probe}[$index];
    }
    return;
  }

  # no cache: read the mas5 file itself. The calls are pushed onto the
  # per-probe arrays, the same layout the cache branch produces and the
  # profile printer dereferences.
  open my $mas5_fh, '<', "$mas5_dir/$file.txt" or die "can't open $mas5_dir/$file.txt: $!";
  while (my $line = <$mas5_fh>) {
    my @tokens = split /\t/, $line;

    my $pvalue;
    if ($self->{parse_old_mas5} && scalar(@tokens) > 4 && $tokens[0] !~ /^Probe/) {
      $pvalue = $tokens[5];
    } elsif ($self->{parse_old_mas5} == 0) {
      $pvalue = $tokens[1];
    } else {
      next; # header or short line of an old style file
    }

    push @{$results->{probes}{$tokens[0]}}, ($pvalue <= 0.05 ? 1 : 0);
  }
  close $mas5_fh;

  return;
}

# a module has to end with a true value to be loadable via use/require
1;

Index: SifFileParser.pm
===================================================================
RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/SifFileParser.pm,v
retrieving revision 1.9
retrieving revision 1.10
diff -C2 -d -r1.9 -r1.10
*** SifFileParser.pm	17 Feb 2006 00:09:22 -0000	1.9
--- SifFileParser.pm	22 Feb 2006 09:15:00 -0000	1.10
***************
*** 25,29 ****
  
     my $samples;
!    if ($sif_format eq 'geo') { $samples = $self->_read_geo_sample_list($sif_file, $file_map_file); }
     else { $samples = $self->_read_sample_list($sif_file, $self->{phenotypes}, $self->{col_ordering}); }
     $stash->{$output_hash_name} = $samples;
--- 25,31 ----
  
     my $samples;
! 
!    if ($sif_format eq 'simple') { $samples = $self->_read_simple_sample_list($sif_file); }
!    elsif ($sif_format eq 'geo') { $samples = $self->_read_geo_sample_list($sif_file, $file_map_file); }
     else { $samples = $self->_read_sample_list($sif_file, $self->{phenotypes}, $self->{col_ordering}); }
     $stash->{$output_hash_name} = $samples;
***************
*** 130,133 ****
--- 132,156 ----
    my $final_output = {'files_by_hc' => $self->{files_by_hc}, 'files' => $self->{files}, 'files_by_survival_time' => $self->{files_by_survival_time}};
    return($final_output);
+ }
+ 
+ # this just reads a three column tab file used by Marc's (Bin's) program
+ # that links filename (without extension) to phenotype. It's only useful
+ # for linking files to one phenotype class at a time
+ sub _read_simple_sample_list {
+   my ($self, $input_sample_list) = @_;
+   open INPUT, "<$input_sample_list" or die;
+   while (<INPUT>) {
+     chomp;
+     my @tokens = split /\t/;
+     my $filename = $tokens[1];
+     my $pheno = $tokens[2];
+     if ($pheno eq 'TAL_R') { $self->{files}{$filename}{TAL_R} = 1; }
+     else { $self->{files}{$filename}{TAL_R} = 0; }
+   }
+   close INPUT;
+ 
+   my $final_output = {'files' => $self->{files}};
+   return($final_output);
+ 
  }
  

Index: CreateProfiles.pm
===================================================================
RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/CreateProfiles.pm,v
retrieving revision 1.15
retrieving revision 1.16
diff -C2 -d -r1.15 -r1.16
*** CreateProfiles.pm	17 Feb 2006 00:09:22 -0000	1.15
--- CreateProfiles.pm	22 Feb 2006 09:14:57 -0000	1.16
***************
*** 28,31 ****
--- 28,32 ----
     my $end = $input->{end}{value};
     my $pre_cache_mas5 = $input->{pre_cache_mas5}{value};
+    my $no_overwrite = $input->{no_overwrite}{value};
  
     my $mas5_cache_output = $output->{mas5_cache_output}{value};
***************
*** 115,118 ****
--- 116,121 ----
    $list_of_cel_files =~ /file_list_(\d+)\.txt/;
    my $file_num = $1;
+   my $outfile = "$profile_dir/$sub_dir/file_list_$file_num.profile";
+   if ($self->{no_overwrite} && -f "$profile_dir/$sub_dir/file_list_$file_num.profile") { return; }
    open LIST, "<$list_of_cel_files" or die;
    my $i = 0;
***************
*** 136,140 ****
    # in the list, next print it out
    print "Printing profile\n";
-   my $outfile = "$profile_dir/$sub_dir/file_list_$file_num.profile";
    $self->_print_profile($results, $outfile);
  }
--- 139,142 ----
***************
*** 297,300 ****
--- 299,310 ----
    }
  
+   # These were added for the T-cell leukemia dataset Soulier et al 2005
+   if ($sif_hash->{'files'}{$old_filename}{TAL_R} =~ /1/) {
+     $results->{samples}{annotations}{TAL_R}{$filename} = 1;
+     $results->{samples}{annotations}{HOX_R}{$filename} = 0;
+   } elsif ($sif_hash->{'files'}{$old_filename}{TAL_R} =~ /0/) {
+     $results->{samples}{annotations}{TAL_R}{$filename} = 0;
+     $results->{samples}{annotations}{HOX_R}{$filename} = 1;
+   }
  }
  

Index: ReadVGLOutput.pm
===================================================================
RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/ReadVGLOutput.pm,v
retrieving revision 1.6
retrieving revision 1.7
diff -C2 -d -r1.6 -r1.7
*** ReadVGLOutput.pm	31 Oct 2005 19:02:21 -0000	1.6
--- ReadVGLOutput.pm	22 Feb 2006 09:14:59 -0000	1.7
***************
*** 24,30 ****
--- 24,36 ----
    my $profile_output_dir = $self->{profile_output_dir};
    my $profile_output_sub_dirs = $self->{profile_output_sub_dirs};
+   my $compare_to_reference = $self->{compare_to_reference};
    my $reference_profile = $self->{reference_profile};
    my $output_dir = $self->{output_dir};
    my $col_offset = $self->{col_offset};
+   my $parser_type = $self->{parser_type};
+   my $base_col = $self->{base_col};
+   my $pheno_str_1 = $self->{pheno_str_1};
+   my $pheno_str_2 = $self->{pheno_str_2};
+   my $input_file_name = $self->{input_file_name};
  
    # the hash that holds hash and cache data
***************
*** 36,41 ****
  
    # the reference sample
!   my $reference = $self->_parse_vgl_output($reference_profile, 0);
!   $results->{parsed_output}{'100'}{reference} = $reference;
    
    # comment out, these may have already been created
--- 42,50 ----
  
    # the reference sample
!   my $reference;
!   if ($compare_to_reference) {
!     $reference = $self->_parse_vgl_output($reference_profile, 0);
!     $results->{parsed_output}{'100'}{reference} = $reference;
!   }
    
    # comment out, these may have already been created
***************
*** 56,61 ****
      my @files = glob("$profile_output_dir/$subdir/file_list_*.txt_results");
      foreach my $file (@files) {
!       print "Curr Profile: $file/All_DChip_expression_vals_longNames_groupTtest.xls\n";
!       my $curr_output = $self->_parse_vgl_output("$file/All_DChip_expression_vals_longNames_groupTtest.xls", $col_offset);
        # this is used to store a count/lines for all profiles encountered in this particular subdir
        $self->_add_to_all_profiles($curr_output, $results->{all_profiles}{$subdir});
--- 65,72 ----
      my @files = glob("$profile_output_dir/$subdir/file_list_*.txt_results");
      foreach my $file (@files) {
!       print "Curr Profile: $file/$input_file_name\n";
!       my $curr_output;
!       if ($parser_type eq '2_phenotypes') { $curr_output = $self->_parse_2_pheno_vgl_output("$file/$input_file_name", $col_offset, $base_col, $pheno_str_1, $pheno_str_2); }
!       else { $curr_output = $self->_parse_vgl_output("$file/$input_file_name", $col_offset); }
        # this is used to store a count/lines for all profiles encountered in this particular subdir
        $self->_add_to_all_profiles($curr_output, $results->{all_profiles}{$subdir});
***************
*** 64,68 ****
        ##$results->{parsed_output}{$subdir}{$file} = $curr_output; # not used by other objects so eliminate to reduce memory usage
        $results->{comparison_to_reference}{$subdir}{$file} = {}; # stores percentage overlap between refernce and curr profile set
!       $self->_compare_vgl_output($reference, $curr_output, $results->{comparison_to_reference}{$subdir}{$file});
        $self->_tally_results($curr_output, $results->{frequency}{$subdir});
      }
--- 75,79 ----
        ##$results->{parsed_output}{$subdir}{$file} = $curr_output; # not used by other objects so eliminate to reduce memory usage
        $results->{comparison_to_reference}{$subdir}{$file} = {}; # stores percentage overlap between refernce and curr profile set
!       if ($compare_to_reference) { $self->_compare_vgl_output($reference, $curr_output, $results->{comparison_to_reference}{$subdir}{$file}); }
        $self->_tally_results($curr_output, $results->{frequency}{$subdir});
      }
***************
*** 75,78 ****
--- 86,90 ----
  
    # DEBUG
+   print "DEBUG DUMP!\n";
    print Dumper($results->{comparison_to_reference});
    print Dumper($results->{frequency});
***************
*** 113,116 ****
--- 125,160 ----
    }
  }
+ 
+ # only one column needs to be examined when there is one phenotype with two states
+ # FIXME: the phenotype is hardcoded here!!!
+ sub _parse_2_pheno_vgl_output {
+   my ($self, $file, $offset, $base, $pheno_str_1, $pheno_str_2) = @_;
+   open INPUT, "<$file" or die "Can't open file: $file\n";
+   my $result;
+   while(<INPUT>) {
+     if (!/^probe.set/) {
+       chomp;
+       my @tokens = split /\t/;
+       my $HC = "";
+       my $pvalue = $tokens[$base+5];
+       my $mean = $tokens[$base+3];
+       my $true_count = 0;
+       if ($tokens[$base+6] eq "TRUE" && $tokens[$base+3] >= 0) {
+         $HC = $pheno_str_1;
+         $true_count++;
+       } elsif ($tokens[$base+6] eq "TRUE" &&  $tokens[$base+3] < 0) {
+         $HC = $pheno_str_2;
+         $true_count++;
+       } 
+       if ($true_count == 1) {
+         $result->{$HC}{$tokens[0]}{pvalue} = $pvalue;
+         $result->{$HC}{$tokens[0]}{mean}   = $mean
+       }
+     }
+   }
+   close INPUT;
+   return($result);
+ }
+ 
  
  sub _parse_vgl_output {

Index: SummarizeClassification.pm
===================================================================
RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/SummarizeClassification.pm,v
retrieving revision 1.5
retrieving revision 1.6
diff -C2 -d -r1.5 -r1.6
*** SummarizeClassification.pm	5 Oct 2005 00:04:20 -0000	1.5
--- SummarizeClassification.pm	22 Feb 2006 09:15:00 -0000	1.6
***************
*** 64,68 ****
            my $correct_annotation = $data->{$subdir}{$number_profiles_used}{$file}{correct_annotation};
            print "Highest: $highest_phenotype Correct: $correct_annotation\n";
!           if ($highest_phenotype =~ /$correct_annotation/) { $output->{$subdir}{$number_profiles_used}{correct_classification}++; 
              print "CORRECT!!!\n";
            }
--- 64,68 ----
            my $correct_annotation = $data->{$subdir}{$number_profiles_used}{$file}{correct_annotation};
            print "Highest: $highest_phenotype Correct: $correct_annotation\n";
!           if ($highest_phenotype =~ /$correct_annotation/i) { $output->{$subdir}{$number_profiles_used}{correct_classification}++; 
              print "CORRECT!!!\n";
            }

Index: PPLARunner.pm
===================================================================
RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/PPLARunner.pm,v
retrieving revision 1.7
retrieving revision 1.8
diff -C2 -d -r1.7 -r1.8
*** PPLARunner.pm	8 Dec 2005 02:30:46 -0000	1.7
--- PPLARunner.pm	22 Feb 2006 09:14:58 -0000	1.8
***************
*** 21,24 ****
--- 21,25 ----
    my $start = $input->{start}{value};
    my $end = $input->{end}{value};
+   my $no_overwrite = $input->{no_overwrite}{value};
  
    foreach my $subdir (split /,/, $self->{profiles_sub_dirs}) {
***************
*** 26,30 ****
      print "DIRECTORY: ".$self->{profiles_dir}."/$subdir\n";
      foreach my $file (@files) {
-  
        $file =~ /_(\d+)\.profile$/;
        my $file_num = $1;
--- 27,30 ----
***************
*** 34,41 ****
        $file =~ /\/(\w+).profile$/;
        my $filename = $1;
        my $command = $self->{ppla_bin}." ".$self->{entropy_filter}." ".$self->{individual_u_max}." ".$self->{together_u_min}." showNoBits ".$self->{lowA}." ".$self->{highA}." ".$self->{number_profiles}." < $file > ".$self->{output_dir}."/$subdir/$filename.".$self->{unique_id}."output";
        print STDERR "$command\n";
-       system($command);
        system("mkdir -p ".$self->{output_dir}."/".$subdir);
        # FIXME: what do I need to do to get logging working!?!?
        #$self->log("PPLARunner", $command);
--- 34,42 ----
        $file =~ /\/(\w+).profile$/;
        my $filename = $1;
+       next if ($no_overwrite && -f $self->{output_dir}."/$subdir/$filename.".$self->{unique_id}."output");
        my $command = $self->{ppla_bin}." ".$self->{entropy_filter}." ".$self->{individual_u_max}." ".$self->{together_u_min}." showNoBits ".$self->{lowA}." ".$self->{highA}." ".$self->{number_profiles}." < $file > ".$self->{output_dir}."/$subdir/$filename.".$self->{unique_id}."output";
        print STDERR "$command\n";
        system("mkdir -p ".$self->{output_dir}."/".$subdir);
+       system($command);
        # FIXME: what do I need to do to get logging working!?!?
        #$self->log("PPLARunner", $command);

Index: ScoreSamplesViaVGL.pm
===================================================================
RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/ScoreSamplesViaVGL.pm,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -d -r1.4 -r1.5
*** ScoreSamplesViaVGL.pm	31 Oct 2005 19:02:21 -0000	1.4
--- ScoreSamplesViaVGL.pm	22 Feb 2006 09:14:59 -0000	1.5
***************
*** 28,31 ****
--- 28,32 ----
  
     my $exp_values = $self->_read_exp_values($self->{exp_values});
+    #print Dumper($exp_values); exit;
  
     # for each file in the test set
***************
*** 144,148 ****
    while(<IN>) {
      chomp;
!     if (/^probe_set/) {
        @files = split /\t/;
        shift @files;
--- 145,149 ----
    while(<IN>) {
      chomp;
!     if (/^probe.set/) {
        @files = split /\t/;
        shift @files;

Index: ReadProfileOutput.pm
===================================================================
RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/ReadProfileOutput.pm,v
retrieving revision 1.8
retrieving revision 1.9
diff -C2 -d -r1.8 -r1.9
*** ReadProfileOutput.pm	8 Dec 2005 02:30:46 -0000	1.8
--- ReadProfileOutput.pm	22 Feb 2006 09:14:59 -0000	1.9
***************
*** 23,26 ****
--- 23,27 ----
    my $profile_output_sub_dirs = $self->{profile_output_sub_dirs};
    my $reference_profile = $self->{reference_profile};
+   my $compare_to_reference = $self->{compare_to_reference};
    my $output_dir = $self->{output_dir};
    my @profiles = split /,/, $self->{profiles_to_count};
***************
*** 30,35 ****
  
    # the reference sample
!   my $reference = $self->_parse_ppla_output($reference_profile, \@profiles);
!   $results->{parsed_output}{'100'}{reference} = $reference;
  
    $Data::Dumper::Indent = 1;
--- 31,39 ----
  
    # the reference sample
!   my $reference;
!   if ($compare_to_reference) {
!     $reference = $self->_parse_ppla_output($reference_profile, \@profiles);
!     $results->{parsed_output}{'100'}{reference} = $reference;
!   }
  
    $Data::Dumper::Indent = 1;
***************
*** 66,70 ****
         
        $results->{comparison_to_reference}{$subdir}{$file} = {}; # stores percentage overlap between refernce and curr profile set
!       $self->_compare_ppla_output($reference, $curr_output, $results->{comparison_to_reference}{$subdir}{$file});
        $self->_tally_results($curr_output, $results->{frequency}{$subdir});
      }
--- 70,74 ----
         
        $results->{comparison_to_reference}{$subdir}{$file} = {}; # stores percentage overlap between refernce and curr profile set
!       if ($compare_to_reference) { $self->_compare_ppla_output($reference, $curr_output, $results->{comparison_to_reference}{$subdir}{$file}); }
        $self->_tally_results($curr_output, $results->{frequency}{$subdir});
      }
***************
*** 76,81 ****
  
    # DEBUG
    #print Dumper($results->{comparison_to_reference});
!   #print Dumper(keys %{$results->{frequency}{50}});
  }
  
--- 80,87 ----
  
    # DEBUG
+   #print Dumper (keys %{$results});
    #print Dumper($results->{comparison_to_reference});
!   print Dumper(keys %{$results->{frequency}{75}});
!   print Dumper($results->{frequency}{75});
  }

[Nelsonlab-cmts] dev-boconnor/project_logic_analysis/data classification_scatterplot.sxc,NONE,1.1

From: <boc...@su...> - 2006-02-22 09:15:07

Update of /cvsroot/dev-boconnor/project_logic_analysis/data
In directory sumo.genetics.ucla.edu:/tmp/cvs-serv8351/data

Added Files:
	classification_scatterplot.sxc 
Log Message:
Many updates to the existing logic analysis libs and also a lot of new additions, particularly scripts


--- NEW FILE: classification_scatterplot.sxc ---
(This appears to be a binary file; contents omitted.)

[Nelsonlab-cmts] dev-boconnor/project_logic_analysis/scripts count_vgl_output_columns.pl,NONE,1.1 fix_testset_annotations.pl,NONE,1.1 fix_testset_t-cell_annotations.pl,NONE,1.1 mkdir_data_dir.sh,NONE,1.1 test_row_length.pl,NONE,1.1

From: <boc...@su...> - 2006-02-22 09:15:07

Update of /cvsroot/dev-boconnor/project_logic_analysis/scripts
In directory sumo.genetics.ucla.edu:/tmp/cvs-serv8351/scripts

Added Files:
	count_vgl_output_columns.pl fix_testset_annotations.pl 
	fix_testset_t-cell_annotations.pl mkdir_data_dir.sh 
	test_row_length.pl 
Log Message:
Many updates to the existing logic analysis libs and also a lot of new additions, particularly scripts


--- NEW FILE: fix_testset_t-cell_annotations.pl ---
#!/usr/local/bin/perl

# Usage: fix_testset_t-cell_annotations.pl <file_list> <sif_storable>
#
# Prints every line of <file_list> followed by a tab and its class:
# "TAL_R" when the Storable sif hash has a true TAL_R value for that name
# under {files}, "HOX_R" otherwise.

use strict;
use warnings;
use Storable;

my ($file_list, $sif_storable) = @ARGV;
die "Usage: $0 <file_list> <sif_storable>\n"
  unless defined($file_list) && defined($sif_storable);

my $data = retrieve($sif_storable);

open my $input_fh, '<', $file_list or die "Can't open file list $file_list: $!\n";
while (my $name = <$input_fh>) {
  chomp $name;
  my $class = $data->{files}{$name}{TAL_R} ? 'TAL_R' : 'HOX_R';
  print "$name\t$class\n";
}
close $input_fh;

--- NEW FILE: mkdir_data_dir.sh ---
#!/bin/sh

# Usage: mkdir_data_dir.sh <data_dir>
#
# Creates <data_dir> and the sub directories listed below.

# Without an argument the commands below would expand to paths such as
# /hypergeometric and /sif, i.e. directly under the filesystem root.
if [ -z "$1" ]; then
  echo "Usage: $0 <data_dir>" >&2
  exit 1
fi

mkdir "$1"
mkdir "$1/hypergeometric"
mkdir "$1/sif"
mkdir "$1/analysis"
# this should actually be checked in and a symlink
mkdir -p "$1/analysis/templates"
mkdir "$1/analysis/classifications"
mkdir "$1/mas5"
mkdir "$1/ppla_output"
mkdir "$1/ppla_output/100"
mkdir "$1/profiles"
mkdir "$1/rand_file_lists"
mkdir "$1/top_profiles"
mkdir "$1/visualization"
mkdir "$1/vgl_output"
# need to add sym link to dirs
mkdir "$1/vgl"


--- NEW FILE: test_row_length.pl ---
# Report rows on STDIN whose whitespace-separated token count differs from
# an expected length, then dump a tally of every token count seen.
#
# Usage: test_row_length.pl <expected_length> < input

use strict;
use warnings;

use Data::Dumper;

my $length = shift;
# The comparison below is numeric, so insist on a non-negative integer
# instead of silently comparing every row against undef.
die "usage: $0 <expected_length> < input\n"
  unless defined $length && $length =~ /\A\d+\z/;

# token count => number of rows with that count
my $data = {};
while (my $line = <STDIN>) {
  my @tokens = split /\s+/, $line;
  my $count = scalar @tokens;
  $data->{$count}++;
  # Echo the offending row, prefixed with the count it actually had.
  if ($count != $length) { print $count."\t".$line; }
}

print Dumper($data);



--- NEW FILE: count_vgl_output_columns.pl ---
# Print "<token> <index>" for every tab-separated column on the first line of
# STDIN whose text contains "selected"; indices are zero-based.

use strict;
use warnings;

# Only the first line is examined; the rest of STDIN is ignored.
my $first_line = <STDIN>;
if (defined $first_line) {
 chomp $first_line;
 my @tokens = split /\t/, $first_line;
 for my $i (0 .. $#tokens) {
  #if ($tokens[$i] =~ /DLDA_MTtoRest_DLDA_MT_vs_DLDA_WT/) { print "$tokens[$i] $i\n"; }
  if ($tokens[$i] =~ /selected/) { print "$tokens[$i] $i\n"; }
 }
}


--- NEW FILE: fix_testset_annotations.pl ---
#!/usr/local/bin/perl

# Annotate each file named in a test-set list with its dlda phenotype.
#
# Usage: fix_testset_annotations.pl <file_list> <sif_storable>
#
#   file_list     text file with one file name per line
#   sif_storable  Storable file holding a hash whose {files}{<name>} entries
#                 are consulted for a true dlda_mt value
#
# Prints "<name>\tdlda_mt" when the flag is set and "<name>\tdlda_wt" otherwise.

use strict;
use warnings;

use Storable;

my ($file_list, $sif_storable) = @ARGV;
die "usage: $0 <file_list> <sif_storable>\n"
  unless defined $file_list && defined $sif_storable;

my $data = retrieve($sif_storable)
  or die "cannot retrieve '$sif_storable': $!\n";

# Three-argument open with a lexical handle: the file name can never be
# misread as an open mode.
open my $input, '<', $file_list
  or die "cannot open '$file_list': $!\n";
while (my $name = <$input>) {
  chomp $name;
  print "$name\t";
  # Look the entry up without autovivifying it, and treat a missing or
  # non-hash entry as "not dlda_mt" so unknown files still fall back to dlda_wt.
  my $entry = $data->{files}{$name};
  if (ref $entry eq 'HASH' && $entry->{dlda_mt}) { print "dlda_mt\n"; }
  else { print "dlda_wt\n"; }
}
close $input;

[Nelsonlab-cmts] dev-boconnor/project_logic_analysis run_p53_breast_cancer_data_Miller_et_al_2005.sh,NONE,1.1 run_t-cell_leukemia_data_Soulier_et_al_2005.sh,NONE,1.1

From: <boc...@su...> - 2006-02-22 09:15:07

Update of /cvsroot/dev-boconnor/project_logic_analysis
In directory sumo.genetics.ucla.edu:/tmp/cvs-serv8351

Added Files:
	run_p53_breast_cancer_data_Miller_et_al_2005.sh 
	run_t-cell_leukemia_data_Soulier_et_al_2005.sh 
Log Message:
Many updates to the existing logic analysis libs and also a lot of new additions, particularly scripts


--- NEW FILE: run_p53_breast_cancer_data_Miller_et_al_2005.sh ---
#!/bin/sh

# Invoke the Nelson::Pipe runner.pl on the p53 breast cancer pipeline config.
# Must be started from the project directory: ./lib/perl and conf/ are relative.
LIBNELSON=/raid5a/boconnor/cvsroot/libnelson/lib

perl -I./lib/perl \
     -I"$LIBNELSON" \
     -I"$LIBNELSON/Nelson/Pipe/lib" \
     "$LIBNELSON/Nelson/Pipe/runner.pl" \
     conf/p53_breast_cancer_data_Miller_et_al_2005.xml


--- NEW FILE: run_t-cell_leukemia_data_Soulier_et_al_2005.sh ---
#!/bin/sh

# Invoke the Nelson::Pipe runner.pl on the t-cell leukemia pipeline config.
# Must be started from the project directory: ./lib/perl and conf/ are relative.
LIBNELSON=/raid5a/boconnor/cvsroot/libnelson/lib

perl -I./lib/perl \
     -I"$LIBNELSON" \
     -I"$LIBNELSON/Nelson/Pipe/lib" \
     "$LIBNELSON/Nelson/Pipe/runner.pl" \
     conf/t-cell_leukemia_data_Soulier_et_al_2005.xml

[Nelsonlab-cmts] dev-boconnor/project_logic_analysis/conf classification_with_profiles.xml,NONE,1.1 classification_with_vgl.xml,NONE,1.1 p53_breast_cancer_data_Miller_et_al_2005.xml,NONE,1.1 t-cell_leukemia_data_Soulier_et_al_2005.xml,NONE,1.1 vgl_parsing_pipe.xml,NONE,1.1 make_profiles_and_run_la_include.xml,1.5,1.6 original_glioma_classification_with_profiles.xml,1.2,1.3

From: <boc...@su...> - 2006-02-22 09:12:28

Update of /cvsroot/dev-boconnor/project_logic_analysis/conf
In directory sumo.genetics.ucla.edu:/tmp/cvs-serv7915

Modified Files:
	make_profiles_and_run_la_include.xml 
	original_glioma_classification_with_profiles.xml 
Added Files:
	classification_with_profiles.xml classification_with_vgl.xml 
	p53_breast_cancer_data_Miller_et_al_2005.xml 
	t-cell_leukemia_data_Soulier_et_al_2005.xml 
	vgl_parsing_pipe.xml 
Log Message:
Updates and additions to the conf files.  I've tried to parameterize these as much as possible


--- NEW FILE: t-cell_leukemia_data_Soulier_et_al_2005.xml ---
<!-- Variables that are used throughout -->
[% datadir = "t-cell_leukemia_data_Soulier_et_al_2005" %]
[% cutoff_for_stability_percent = 10 %]
[% dirs = ['90']  %]
[% dir_str = '90' %]
[% percent_to_hold_for_testset = 20 %]
[% num_profiles = 2 %]
[% total_number_profiles = 100 %]
[% times_to_repeat = 100 %]
[% profile_block_size = 10 %]
[% ppla_block_size = 10 %]
[% index = 0 %]
<project project_name="Project_Logic_Analysis"
         project_description="This project looks at understanding gene relationships using the logic
                              analysis technique created by P. Bowers and T. Yeates.  I've extended 
                              the technique to use microarray data."
         db_uri="dbi:Pg:host=164.67.97.78;dbname=pipe" db_user="boconnor" db_password="">
  <pipe pipe_name="Logic_Analysis_Network_Stability_With_T-Cell_Leukemia_Data_Pipe"
        pipe_desc="Tests the stability of networks using the t-cell leukemia dataset by Soulier et al."
        pipe_dir="/raid5a/boconnor/cvsroot/dev-boconnor/project_logic_analysis"
        testing_mode="1">
    <settings>
      <plugin name="Logger" processor="Nelson::Pipe::Container::Plugin::Logger" log_to="db"/>
      <plugin name="SystemStateRecorder" processor="Nelson::Pipe::Container::Plugin::SystemStateRecorder"/>
      <plugin name="Versioner" processor="Nelson::Pipe::Container::Plugin::CVSVersioner"
              version_dir="/raid5a/boconnor/cvsroot/dev-boconnor/project_logic_analysis"
              tag_identifier="Logic_Analysis_Network_Stability_With_T-Cell_Leukemia_Data_Pipe"/>
      <plugin name="Publisher" processor="Nelson::Pipe::Container::Plugin::WebPublisher"
              publish_root_dir="/raid5a/boconnor/public_html/Projects"
              publish_url_prefix="http://sumo.genetics.ucla.edu/~boconnor/Projects"/>
    </settings>
    <initialization>
      <plugin name="Logger"/>
      <plugin name="SystemStateRecorder"/>
      <plugin name="Versioner"/>
    </initialization>
    <run>

      [%# this is an example of a comment %]

      <!--
        Parses SIF, makes profiles, runs LA, and parses the result into a common data structure (which 
        is detailed in my blog entry here: https://boconnor.is-a-geek.com/wiki/index.php?n=BoconnorResearchBlog.20051108)
      -->
      [% sif_file = "data/t-cell_leukemia_data_Soulier_et_al_2005/sif/t-cell_sif_for_vgl.txt" %]
      [% file_map_file = "NA" %]
      [% phenotypes = "NA" %]
      [% col_ordering = "NA" %]
      [% sif_format = "simple" %]
      [% random_selection_technique = "across_all_samples" %]
      [% parse_old_mas5 = 0 %]
      [% profiles_to_count = "TAL_R,HOX_R" %]
      [% compare_to_reference = "0" %]
      [% INCLUDE make_profiles_and_run_la_include.xml %]

      <!-- Code that parses the Voting Gene List output -->
      [% index = index + 100 %]
      [% base_col = 81 %] <!-- this is the column that starts a result, it just has a column header and nothing in the rows -->
      [% input_file_name = "dChipExpr_Leukemia_groupTtest.xls" %]
      [% pheno_str_1 = "TAL_R" %]
      [% pheno_str_2 = "HOX_R" %]
      [%# INCLUDE vgl_parsing_pipe.xml %]

      <!-- 
       Now take the logic analysis information and extract the top X profiles present in Y% or more of the experiments
       and 1) produce a sorted HTML output for it that can be browsed and 2) build up networks for the same set of
       profiles and graph them out with graphviz.
      -->
     <!-- FIXME: It doesn't always find the profile in the reference set.  Need to fix this!!! -->
     <!-- FIXME: I thought I fixed the not finding profile in ref set problem but still happens in 50% set -->
     <!-- FIXME: need to find all the profiles otherwise there won't be much to classify with -->
      [% index = index + 100 %] <!-- FIXME: scoping issues with this variable!! -->
      <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::PickTopProfiles">
        <input>
          <item id="parsed_output_stashname" value="summary_of_ppla_output"/>
          <item id="parsed_output_filename"  value="data/[% datadir %]/ppla_output/parsed_output.storable"/>
          <item id="profile_sub_dirs" value="[% dir_str %]"/>
          <item id="cutoff" value="[% cutoff_for_stability_percent %]"/>
          <item id="complete_ppla_output" value="data/[% datadir %]/ppla_output/100/file_list_0.output"/> <!-- Not used -->
        </input>
        <output>
          <item id="output_dir" value="data/[% datadir %]/top_profiles"/>
        </output>
      </step>

      <!-- Morgan's program (which I heavily modified) to create an HTML document to display the profiles of interest -->
      <!-- perl visualiseTriplets.pl ../glioma_data/sorted_profiles_both_annotated.out ../glioma_data/profiles.txt -eprofile_results_complete_annotations.storable > ~/public_html/project_logic_analysis/glioma_profiles/brain_profiles_logic_type_both_annotated.html -->
      [% index = index + 1 %]
      [% FOREACH dir = dirs %]
      <!-- FIXME: does this sort the profiles? I don't do that elsewhere. Maybe I should in PickTopProfiles -->
      <step id="[% index %]" active="0" type="shell_command" processor="perl scripts/visualiseTriplets.pl">
        <processor_args>
          <arg id="1" name="" value="data/[% datadir %]/top_profiles/[% dir %]/top_[% cutoff_for_stability_percent %]_percent.profiles"/>
          <arg id="2" name="" value="data/[% datadir %]/profiles/100/file_list_0.profile"/>
          <arg id="3" name="" value="-edata/annotations/profile_results_complete_annotations.storable"/>
          <arg id="4" name="" value="&gt; data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.html"/>
        </processor_args>
        <output>
          <item id="output_file" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.html" publish="0"/>
        </output>
      </step>
      [% index = index + 1 %]
      [% END %]


      <!-- 
        This section calls Peter's code to calculate p-values based on hypergeometric dist.
        It relies on the output of ReadProfileOutput and PickTopProfiles. It's a hack on the 
        visualizer to create an input to Peter's hypergeometric calculation.
        -->
      [% index = index + 1 %]
      [% FOREACH dir = dirs %]
      <!-- FIXME: does this sort the profiles? I don't do that elsewhere. Maybe I should in PickTopProfiles -->
      <step id="[% index %]" active="0" type="shell_command" processor="perl scripts/exportTriplets.pl">
        <processor_args>
          <arg id="1" name="" value="data/[% datadir %]/top_profiles/[% dir %]/top_[% cutoff_for_stability_percent %]_percent.profiles"/>
          <arg id="2" name="" value="data/[% datadir %]/profiles/100/file_list_0.profile"/>
          <arg id="3" name="" value="-edata/annotations/profile_results_complete_annotations.storable"/>
          <arg id="4" name="" value="&gt; data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt"/>
        </processor_args>
        <output>
          <item id="output_file" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt" publish="0"/>
        </output>
      </step>
      [% index = index + 1 %]
      <step id="[% index %]" active="0" type="shell_command" processor="perl scripts/hypergeometric.pl">
        <processor_args>
          <arg id="1" name="" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt"/>
          <arg id="2" name="" value="&gt; data/[% datadir %]/hypergeometric/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent_hyper_probs.txt"/>
        </processor_args>
      </step>
      <!-- This script just takes the output from exportTriplets and adds some additional information (stability score and p-value)
           It also reads in the storable object (profile_data) which contains a ton of parsed data and creates a new "p-value"
           entry that stores the various p-value calculations done by Peter.  This is used by FindMostConnectedNodes and 
           visualiseTriplets.pl to annotate the results.
           This script writes the frequency and p-values back to data/[% datadir %]/ppla_output/parsed_output.storable
        -->
      <!-- FIXME: I should look for ways to consolidate the writing of p-values and freq. back to parsed_output.storable -->
      <!-- FIXME: the information contained in the output of this script is really useful and I should (somehow) add it to the 
           visualiseTriplet.pl output.-->
      [% index = index + 1 %]
      <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::AppendPValuesToExportOutput">
        <input>
          <item id="profile_output" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt"/>
          <item id="hypergeometric_output" value="data/[% datadir %]/hypergeometric/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent_hyper_probs.txt"/>
          <item id="profile_data" value="data/[% datadir %]/ppla_output/parsed_output.storable"/>
          <item id="subdir" value="[% dir %]"/>
        </input>
        <output>
          <item id="output_file"  value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent_w_pvalues.txt"/>
          <item id="output_storable" value="data/[% datadir %]/ppla_output/parsed_output.storable"/>
        </output>
      </step>
      [% index = index + 1 %]
      [% END %]

      <!-- now parse out the top profiles and collect some statistics on them -->
      <!-- FIXME: this is redundant with what's below! -->
      [% index = index + 1 %]
      <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAOutputProfiles">
        <input>
          <item id="ppla_output_profiles_dir" value="data/[% datadir %]/top_profiles"/>
          <item id="subdirs" value="[% dir_str %]"/>
          <item id="filename" value="top_[% cutoff_for_stability_percent %]_percent.profiles"/>
        </input>
        <output>
          <item id="stash_output" value="stable_ppla_output_profiles"/>
          <item id="stash_output_file" value="data/[% datadir %]/top_profiles/parsed_ppla_output.storable"/>
        </output>
      </step>
      <!--
        This step goes through the PPLA output parsed above and counts how many times a given
        probeset is included in a triplet relationship.  It then summarizes this information into
        a hash and hands off the display of the information to a tt2.  The output is an HTML document
        that lists the most connected genes and links to the output to visualiseTriplets.pl for each subset.
        This script is responsible for calling visualiseTriplets.pl on the subset of the triplets in question
        to visualize the individual networks with html and png output. 
        -->
      <!-- FIXME: this step should be followed with other network-based analysis on the logic relationships -->
      <!-- FIXME: subdir is currently hardcoded inside this script!! -->
      <!-- FIXME: remove the calls to other programs/scripts and move this fxn into another module -->
      [% index = index + 1 %]
      [% FOREACH dir = dirs %]
      <!-- FIXME: failing with "undef error - Can't use string ("") as a HASH ref while "strict refs" in use at lib/perl/Nelson/Pipe/Container/Job/FindMostConnectedNodes.pm line 383" -->
      <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::FindMostConnectedNodes">
        <input>
          <item id="stash_input" value="stable_ppla_output_profiles"/>
          <item id="ppla_output_profiles_dir" value="data/[% datadir %]/top_profiles"/>
          <item id="subdir" value="[% dir%]"/>
          <item id="extra_info" value="data/annotations/profile_results_complete_annotations.storable"/>
          <item id="filename" value="top_[% cutoff_for_stability_percent %]_percent.profiles"/>
          <item id="pvalues" value="data/[% datadir %]/ppla_output/parsed_output.storable"/>
          <item id="AppendPValuesToExportOutput_output" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent_w_pvalues.txt"/>
          <item id="min_triplets" value="3"/>
          <item id="template_most_connected" value="index_for_connected_nodes.tt2"/>
          <item id="template_lowest_p_value" value="index_for_connected_nodes.tt2"/>
          <item id="template_most_stable" value="index_for_connected_nodes.tt2"/>
          <item id="template_detailed" value="details_for_connected_nodes.tt2"/>
          <item id="template_dir" value="data/[% datadir %]/analysis/templates"/>
          <item id="profiles_to_count" value="[% profiles_to_count %]"/>
        </input>
        <output>
          <item id="output_dir" value="data/[% datadir %]/visualization/90/top_[% cutoff_for_stability_percent %]_percent_stable/breakdown_3_or_more"/>
        </output>
      </step>
      [% index = index + 1 %]
      [% END %]

      <!-- Just creates a summary page at http://humerus/project_logic_analysis -->
      [% index = index + 1 %]
      <step id="[% index %]" active="0" type="shell_command" processor="./scripts/wiki2html.pl">
        <processor_args>
          <arg id="1" name="" value="data/[% datadir %]/visualization/introduction.txt"/>
          <arg id="2" name="" value="&gt; /raid5a/boconnor/public_html/Projects/Project_Logic_Analysis/Logic_Analysis_Network_Stability_With_Original_Brain_Tumor_Data_Pipe/index.html"/>
        </processor_args>
      </step>

      <!-- look for a bias in the oncogene/tumor suppressor annotations. This is less flexible than
           the generic annotation bias checker below
        -->
      <!-- FIXME: includes some hardcoded elements -->
      [%# INCLUDE oncogene_counts.xml %]

      <!-- This series of scripts takes an input list of "interesting" probesets and builds a network of what they connect with.
           It annotates those probesets using the Affy array information file and then colors the nodes as green if a "!" probeset
           and red if it's expressed. FIXME: this assumption only works if the network is built with one phenotype at a time!
           The nodes then link back to the summary HTML descriptions and the edges link to records within the HTML description files
           making it easy to see the actual relationship, binary profiles, and additional free text annotations associated with each
           probeset.  The output is directed to a visualization directory.
       -->
      [%# INCLUDE build_interesting_networks.xml %]

      <!-- A very simple series of scripts that 1) pull out a non-redundant list of probesets in the most 
           stable logic triplets, 2) count the number of probesets whose OMIM record contains one or more of a 
           collection of keyword terms, 3) compare this to X number of random trials where the same number of 
           probesets are selected randomly and the annotation bias is checked, and finally 4) treat these numbers
           of matching probesets for each trial as a normal random variable and compute a two-tailed p-value for
           the number of annotations on the original list of probesets. 
       -->
      [%# INCLUDE compare_annotation_bias.xml %]

      <!-- 
        # identify list of probesets of interest
        # for each probeset, identify genomic location via chado
        # extract upstream region of 2Kb
        # scan 2Kb region for known binding sites
        # identify factors binding these sites
        # repeat whole process X times with random lists of probesets and evaluate significance of results
        (maybe I can work with Barry on a statistical technique that doesn't require random sampling)
       -->
      [%# INCLUDE search_for_tf_binding_sites.xml %]

            

      <!-- Performs classification based on profiles (triplet relationships). Take Z number of non-training set data and run it through a 
        prediction process where the microarray data is converted to [1|0] and each profile is assessed. If the profile is valid
        the score gets a 1, otherwise -1; the total is normalized by the number of profiles used for that HC.  At the end there should
        be a score for each HC for a given sample; assign the sample to the HC with the highest score. -->
      [% profiles_to_count = "TAL_R,HOX_R" %] <!-- FIXME: this is redefined from above -->
      [% test_set_annotations_file = "test_set" %]
      [% index = index + 100 %]
      [% INCLUDE classification_with_profiles.xml %]

      <!-- Perform the classification based on the vgl from Marc -->
      <!-- All the inputs need to be defined here! -->
      [% index = index + 100 %]
      [% testset_w_annotations = "test_set_90_w_annotations.txt" %]
      [% exp_values = "data/t-cell_leukemia_data_Soulier_et_al_2005/dChipExpr_Leukemia.xls" %]
      [%# INCLUDE classification_with_vgl.xml %]
<!-- LEFT OFF HERE -->
      <!-- Collect some statistics on stability and U score -->
      [%# INCLUDE original_glioma_statistics_on_stability.xml %]

    </run>
    <cleanup>
      <plugin name="Publisher"/>
    </cleanup>
  </pipe>
</project>

--- NEW FILE: classification_with_profiles.xml ---
      <!-- 
        The next steps will read the PPLA input from CreateProfiles, read the top X number of profiles 
        from PickTopProfiles, and use these profiles and expression data to, for each sample in a list of samples,
        score each sample +1 if it matches a profile and -1 if it doesn't match a profile. The output is a hash
        where the key is the sample name and the values are each phenotype (ie HC) and its corresponding score
        normalized by the number of profiles used for that score and ranging between 1 and -1. Finally the results
        will be summarized as correct or not and the overall predictive process will be scored.
        -->  
      <!-- FIXME: all PPLA input files must contain /^sample/ on the first row -->
      [% index = index + 1 %]
      <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAInputFile">
        <input>
          <item id="ppla_input_file" value="data/[% datadir %]/profiles/100/file_list_0.profile"/>
        </input>
        <output>
          <item id="stash_output" value="complete_ppla_input"/>
        </output>
      </step>
      [% index = index + 1 %]
      <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAOutputProfiles">
        <input>
          <item id="ppla_output_profiles_dir" value="data/[% datadir %]/top_profiles"/>
          <item id="subdirs" value="[% dir_str %]"/>
          <item id="filename" value="top_[% cutoff_for_stability_percent %]_percent.profiles"/>
        </input>
        <output>
          <item id="stash_output" value="stable_ppla_output_profiles"/>
        </output>
      </step>
      <!-- FIXME: did I code all the logic types correctly? Also, this module contains hardcoded phenotypes. -->
      [% FOREACH dir = dirs %]
      [% index = index + 1 %]
      <!-- Normally, RandomlySelectFiles should produce this output that includes the filename\tannotation.  The problem is
           what annotation to use?  In the brain tumor data it was simple: HC_1A...etc.  Here I want to use dlda phenotype
           but I modified RandomlySelectFiles to not output the phenotype (because of an optimization it isn't stored).
           I wrote a quick script below to append the annotation onto the file.  It should only be used when data is read
           for non-glioma datasets.
           This uses the data from SifFileParser to find the annotations.
        -->
      <!-- FIXME: THIS STEP IS A HACK!!!! -->
      <step id="[% index %]" active="0" type="shell_command" processor="./scripts/fix_testset_t-cell_annotations.pl">
        <processor_args>
          <arg id="1" name="" value="data/[% datadir %]/rand_file_lists/[% test_set_annotations_file %]_[% dir %].txt"/>
          <arg id="2" name="" value="data/[% datadir %]/sif_hash.storable"/>
          <arg id="3" name="" value="&gt; data/[% datadir %]/rand_file_lists/[% test_set_annotations_file %]_[% dir %]_w_annotations.txt"/>
        </processor_args>
      </step>
      [% index = index + 1 %]
      <!-- FIXME: this needs to loop and only use the top 10,20,30,40...100 top profiles for each HC -->
      <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::ScoreSamplesViaProfiles">
        <input>
          <!-- FIXME: these phenotypes are hard coded, may need to change with different datasets -->
          <!-- FIXME: I could run this module a second time using an expanded test set with the 111 glioma samples, filtering out anything used in training. -->
          <item id="list_of_files" value="data/[% datadir %]/rand_file_lists/[% test_set_annotations_file %]_[% dir %]_w_annotations.txt"/>
          <item id="list_of_phenotypes" value="[% profiles_to_count %]"/>
          <item id="ppla_input_stash" value="complete_ppla_input"/>
          <item id="profiles_stash" value="stable_ppla_output_profiles"/>
          <item id="subdir" value="[% dir %]"/>
          <item id="total_profiles_to_use" value="[% num %]"/>
          <item id="profile_count_cutoffs" value="1,2,3,4,5,10,15,20,30,40,50,60,70,80,90,100,all"/>
        </input>
        <output>
          <item id="stash_output" value="scores_for_samples"/>
          <item id="output_summary_file" value="data/[% datadir %]/analysis/classifications/[% dir %]_profile_based_classification_summary"/>
        </output>
      </step>
      [% index = index + 1 %]
      <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::SummarizeClassification">
        <input>
          <item id="stash_input" value="scores_for_samples"/>
          <item id="subdir" value="[% dir %]"/>
          <!-- FIXME: this template includes a hard-coded dimension!! -->
          <item id="sample_number" value="16"/>
          <item id="R_template" value="data/[% datadir %]/analysis/templates/profile_count_vs_correct_percentage.R.tt2"/>
        </input>
        <output>
          <item id="R_file" value="data/[% datadir %]/analysis/results/[% dir %]_profile_count_vs_correct_percentage_profiles.R"/>
          <item id="png_file_dir" value="data/[% datadir %]/analysis/results"/>
        </output>
      </step>
      [% index = index + 1 %]
      <step id="[% index %]" active="0" type="shell_command" processor="R">
        <processor_args>
          <arg id="1" name="" value="--vanilla"/>
          <arg id="2" name="" value="&lt; data/[% datadir %]/analysis/results/[% dir %]_profile_count_vs_correct_percentage_profiles.R"/>
        </processor_args>
      </step>
      [% index = index + 1 %]
      [% END %]

      <!--
       THE SAME BUT RANDOMIZED! 
        -->

      <!-- 
        The next steps will read the PPLA input from CreateProfiles, read the top X number of profiles 
        from PickTopProfiles, and use these profiles and expression data to, for each sample in a list of samples,
        score each sample +1 if it matches a profile and -1 if it doesn't match a profile. The output is a hash
        where the key is the sample name and the values are each phenotype (ie HC) and its corresponding score
        normalized by the number of profiles used for that score and ranging between 1 and -1. Finally the results
        will be summarized as correct or not and the overall predictive process will be scored.
        -->  
      <!-- FIXME: all PPLA input files must contain /^sample/ on the first row -->
      [% index = index + 1 %]
      <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAInputFile">
        <input>
          <item id="ppla_input_file" value="data/[% datadir %]/profiles/100/file_list_0.profile"/>
          <item id="randomized" value="1"/>
        </input>
        <output>
          <item id="stash_output" value="complete_ppla_input_randomized"/>
        </output>
      </step>
      [% index = index + 1 %]
      <!-- FIXME: did I code all the logic types correctly? Also, this module contains hardcoded phenotypes. -->
      [% FOREACH dir = dirs %]
      [% index = index + 1 %]
      <!-- Normally, RandomlySelectFiles should produce this output that includes the filename\tannotation.  The problem is
           what annotation to use?  In the brain tumor data it was simple: HC_1A...etc.  Here I want to use dlda phenotype
           but I modified RandomlySelectFiles to not output the phenotype (because of an optimization it isn't stored).
           I wrote a quick script below to append the annotation onto the file.  It should only be used when data is read
           for non-glioma datasets.
           This uses the data from SifFileParser to find the annotations.
        -->
      <!-- FIXME: this needs to loop and only use the top 10,20,30,40...100 top profiles for each HC -->
      <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::ScoreSamplesViaProfiles">
        <input>
          <!-- FIXME: these phenotypes are hard coded, may need to change with different datasets -->
          <!-- FIXME: I could run this module a second time using an expanded test set with the 111 glioma samples, filtering out anything used in training. -->
          <item id="list_of_files" value="data/[% datadir %]/rand_file_lists/[% test_set_annotations_file %]_[% dir %]_w_annotations.txt"/>
          <item id="list_of_phenotypes" value="[% profiles_to_count %]"/>
          <item id="ppla_input_stash" value="complete_ppla_input_randomized"/>
          <item id="profiles_stash" value="stable_ppla_output_profiles"/>
          <item id="subdir" value="[% dir %]"/>
          <item id="total_profiles_to_use" value="[% num %]"/>
          <item id="profile_count_cutoffs" value="1,2,3,4,5,10,15,20,30,40,50,60,70,80,90,100,all"/>
        </input>
        <output>
          <item id="stash_output" value="scores_for_samples_randomized"/>
          <item id="output_summary_file" value="data/[% datadir %]/analysis/classifications/[% dir %]_profile_based_classification_randomized_summary"/>
        </output>
      </step>
      [% index = index + 1 %]
      <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::SummarizeClassification">
        <input>
          <item id="stash_input" value="scores_for_samples_randomized"/>
          <item id="subdir" value="[% dir %]"/>
          <!-- FIXME: this template includes a hard-coded dimension!! -->
          <item id="sample_number" value="16"/>
          <item id="R_template" value="data/[% datadir %]/analysis/templates/profile_count_vs_correct_percentage.R.tt2"/>
        </input>
        <output>
          <item id="R_file" value="data/[% datadir %]/analysis/results/[% dir %]_profile_count_vs_correct_percentage_profiles_randomized.R"/>
          <item id="png_file_dir" value="data/[% datadir %]/analysis/results"/>
        </output>
      </step>
      [% index = index + 1 %]
      <step id="[% index %]" active="1" type="shell_command" processor="R">
        <processor_args>
          <arg id="1" name="" value="--vanilla"/>
          <arg id="2" name="" value="&lt; data/[% datadir %]/analysis/results/[% dir %]_profile_count_vs_correct_percentage_profiles_randomized.R"/>
        </processor_args>
      </step>
      [% index = index + 1 %]
      [% END %]

Index: make_profiles_and_run_la_include.xml
===================================================================
RCS file: /cvsroot/dev-boconnor/project_logic_analysis/conf/make_profiles_and_run_la_include.xml,v
retrieving revision 1.5
retrieving revision 1.6
diff -C2 -d -r1.5 -r1.6
*** make_profiles_and_run_la_include.xml	17 Feb 2006 00:09:07 -0000	1.5
--- make_profiles_and_run_la_include.xml	22 Feb 2006 09:12:23 -0000	1.6
***************
*** 2,6 ****
        [% index = index + 1 %]
        <!-- parses the SIF file to generate a hash of file names and their HC -->
!       <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::SifFileParser">
          <input>
            <item id="sif_file" value="[% sif_file %]"/>
--- 2,6 ----
        [% index = index + 1 %]
        <!-- parses the SIF file to generate a hash of file names and their HC -->
!       <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::SifFileParser">
          <input>
            <item id="sif_file" value="[% sif_file %]"/>
***************
*** 51,54 ****
--- 51,55 ----
            <item id="end" value="[% j+profile_block_size %]"/>
            <item id="pre_cache_mas5" value="1"/>
+           <item id="no_overwrite" value="1"/>
          </input>
          <output>
***************
*** 58,63 ****
        </step>
        [% index = index + 1 %]
-       [% j = j+profile_block_size %]
        [% END %]
        [% END %]
  
--- 59,64 ----
        </step>
        [% index = index + 1 %]
        [% END %]
+       [% j = j+profile_block_size %]
        [% END %]
  
***************
*** 65,68 ****
--- 66,70 ----
        [% index = index + 1 %]
        <!-- the next two steps just read all the profiles -->
+       <!-- FIXME: LEFT OFF HERE, this dataset should include none of the testing samples!! -->
        <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::RandomlySelectFiles">
          <input>
***************
*** 102,110 ****
        [% WHILE j < total_number_profiles %]
        <!-- execution_type="cluster" -->
!       <step id="[% index %].[% j %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::PPLARunner" execution_type="cluster">
          <input>
            <item id="entropy_filter" value="3"/>
!           <item id="individual_u_max" value="0.4"/>
!           <item id="together_u_min" value="0.6"/>
            <item id="number_profiles" value="[% num_profiles %]"/>
            <item id="lowA" value="-1"/>
--- 104,112 ----
        [% WHILE j < total_number_profiles %]
        <!-- execution_type="cluster" -->
!       <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::PPLARunner">
          <input>
            <item id="entropy_filter" value="3"/>
!           <item id="individual_u_max" value="0.3"/>
!           <item id="together_u_min" value="0.5"/>
            <item id="number_profiles" value="[% num_profiles %]"/>
            <item id="lowA" value="-1"/>
***************
*** 115,118 ****
--- 117,121 ----
            <item id="start" value="[% j %]"/>
            <item id="end" value="[% j+profile_block_size %]"/>
+           <item id="no_overwrite" value="1"/>
            <!-- item id="profiles_sub_dirs" value="75,90"/ -->
          </input>
***************
*** 122,128 ****
        </step>
        [% j = j+ppla_block_size %]
-       [% END %]
        [% index = index + 1 %]
        [% END %]
  
        [% index = index + 1 %]
--- 125,153 ----
        </step>
        [% j = j+ppla_block_size %]
        [% index = index + 1 %]
        [% END %]
+       [% END %]
+ 
+       [% index = index + 1 %]
+       <!-- This runs too slow! -->
+       <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::PPLARunner">
+         <input>
+           <item id="entropy_filter" value="3"/>
+           <item id="individual_u_max" value="0.3"/>
+           <item id="together_u_min" value="0.5"/>
+           <item id="number_profiles" value="[% num_profiles %]"/>
+           <item id="lowA" value="-1"/>
+           <item id="highA" value="-1"/>
+           <item id="ppla_bin" value="bin/PPLA-1.1-255"/>
+           <item id="profiles_dir" value="data/[% datadir %]/profiles"/>
+           <item id="profiles_sub_dirs" value="100"/>
+           <item id="start" value="0"/>
+           <item id="end" value="2"/>
+         </input>
+         <output>
+           <item id="output_dir" value="data/[% datadir %]/ppla_output"/>
+         </output>
+       </step>
+ 
  
        [% index = index + 1 %]
***************
*** 135,144 ****
             https://boconnor.is-a-geek.com/wiki/index.php?n=BoconnorResearchBlog.20051108
          -->
!       <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::ReadProfileOutput">
          <input>
            <item id="profile_output_dir" value="data/[% datadir %]/ppla_output"/>
            <item id="profile_output_sub_dirs" value="[% dir_str %]"/>
            <item id="reference_profile" value="data/[% datadir %]/ppla_output/100/file_list_0.output"/>
!           <item id="profiles_to_count" value="HC_1A,HC_1B,HC_2A,HC_2B,grade_3,grade_4,sex_f,sex_m,survial_time_group_36,survial_time_group_37,survial_time_group_38,survial_time_group_43,survial_time_group_53,survial_time_group_54,survial_time_group_57,survial_time_group_31,survial_time_group_32,tumor_type_astro,tumor_type_gbm,tumor_type_mixed,tumor_type_oligo,survival_cluster_1,survival_cluster_2"/>
          </input>
          <output>
--- 160,170 ----
             https://boconnor.is-a-geek.com/wiki/index.php?n=BoconnorResearchBlog.20051108
          -->
!       <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::ReadProfileOutput">
          <input>
            <item id="profile_output_dir" value="data/[% datadir %]/ppla_output"/>
            <item id="profile_output_sub_dirs" value="[% dir_str %]"/>
            <item id="reference_profile" value="data/[% datadir %]/ppla_output/100/file_list_0.output"/>
!           <item id="profiles_to_count" value="[% profiles_to_count %]"/>
!           <item id="compare_to_reference" value="[% compare_to_100_percent_reference %]"/>
          </input>
          <output>

Index: original_glioma_classification_with_profiles.xml
===================================================================
RCS file: /cvsroot/dev-boconnor/project_logic_analysis/conf/original_glioma_classification_with_profiles.xml,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** original_glioma_classification_with_profiles.xml	18 Jan 2006 01:15:26 -0000	1.2
--- original_glioma_classification_with_profiles.xml	22 Feb 2006 09:12:23 -0000	1.3
***************
*** 8,12 ****
          -->  
        <!-- FIXME: all PPLA input files must contain /^sample/ on the first row -->
!       <step id="43.1" active="0" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAInputFile">
          <input>
            <item id="ppla_input_file" value="data/[% datadir %]/profiles/100/file_list_0.profile"/>
--- 8,13 ----
          -->  
        <!-- FIXME: all PPLA input files must contain /^sample/ on the first row -->
!       [% index = index + 1 %]
!       <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAInputFile">
          <input>
            <item id="ppla_input_file" value="data/[% datadir %]/profiles/100/file_list_0.profile"/>
***************
*** 16,20 ****
          </output>
        </step>
!       <step id="43.2" active="0" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAOutputProfiles">
          <input>
            <item id="ppla_output_profiles_dir" value="data/[% datadir %]/top_profiles"/>
--- 17,22 ----
          </output>
        </step>
!       [% index = index + 1 %]
!       <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAOutputProfiles">
          <input>
            <item id="ppla_output_profiles_dir" value="data/[% datadir %]/top_profiles"/>
***************
*** 27,44 ****
        </step>
        <!-- FIXME: did I code all the logic types correctly? Also, this module contains hardcoded phenotypes. -->
-       [% i = 1 %]
        [% FOREACH dir = dirs %]
        <!-- FIXME: this needs to loop and only use the top 10,20,30,40...100 top profiles for each HC -->
!       <step id="44.1[% i %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::ScoreSamplesViaProfiles">
          <input>
            <!-- FIXME: these phenotypes are hard coded, may need to change with different datasets -->
            <!-- FIXME: I could run this module a second time using an expanded test set with the 111 glioma samples, filtering out anything used in training. -->
!           <item id="list_of_files" value="data/[% datadir %]/rand_file_lists/test_set.txt"/>
!           <item id="list_of_phenotypes" value="HC_1A,HC_2A,HC_1B,HC_2B"/>
            <item id="ppla_input_stash" value="complete_ppla_input"/>
            <item id="profiles_stash" value="stable_ppla_output_profiles"/>
            <item id="subdir" value="[% dir %]"/>
            <item id="total_profiles_to_use" value="[% num %]"/>
!           <item id="profile_count_cutoffs" value="5,10,15,20,30,40,50,60,70,80,90,100,all"/>
          </input>
          <output>
--- 29,62 ----
        </step>
        <!-- FIXME: did I code all the logic types correctly? Also, this module contains hardcoded phenotypes. -->
        [% FOREACH dir = dirs %]
+       [% index = index + 1 %]
+       <!-- Normally, RandomlySelectFiles should produce this output that includes the filename\tannotation.  The problem is
+            what annotation to use?  In the brain tumor data it was simple: HC_1A...etc.  Here I want to use dlda phenotype
+            but I modified RandomlySelectFiles to not output the phenotype (because of optimization it isn't stored).
+            I wrote a quick script below to append the annotation onto the file.  It should only be used when data is read
+            for non-glioma datasets.
+            This uses the data from SifFileParser to find the annotations.
+         -->
+       <!-- FIXME: THIS STEP IS A HACK!!!! -->
+       <step id="[% index %]" active="1" type="shell_command" processor="./scripts/fix_testset_t-cell_annotations.pl">
+         <processor_args>
+           <arg id="1" name="" value="data/[% datadir %]/rand_file_lists/[% test_set_annotations_file %]_[% dir %].txt"/>
+           <arg id="2" name="" value="data/[% datadir %]/sif_hash.storable"/>
+           <arg id="3" name="" value="&gt; data/[% datadir %]/rand_file_lists/[% test_set_annotations_file %]_[% dir %]_w_annotations.txt"/>
+         </processor_args>
+       </step>
+       [% index = index + 1 %]
        <!-- FIXME: this needs to loop and only use the top 10,20,30,40...100 top profiles for each HC -->
!       <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::ScoreSamplesViaProfiles">
          <input>
            <!-- FIXME: these phenotypes are hard coded, may need to change with different datasets -->
            <!-- FIXME: I could run this module a second time using an expanded test set with the 111 glioma samples, filtering out anything used in training. -->
!           <item id="list_of_files" value="data/[% datadir %]/rand_file_lists/[% test_set_annotations_file %]_[% dir %]_w_annotations.txt"/>
!           <item id="list_of_phenotypes" value="[% profiles_to_count %]"/>
            <item id="ppla_input_stash" value="complete_ppla_input"/>
            <item id="profiles_stash" value="stable_ppla_output_profiles"/>
            <item id="subdir" value="[% dir %]"/>
            <item id="total_profiles_to_use" value="[% num %]"/>
!           <item id="profile_count_cutoffs" value="1,2,3,4,5,10,15,20,30,40,50,60,70,80,90,100,all"/>
          </input>
          <output>
***************
*** 47,70 ****
          </output>
        </step>
!       <step id="44.2[% i %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::SummarizeClassification">
          <input>
            <item id="stash_input" value="scores_for_samples"/>
            <item id="subdir" value="[% dir %]"/>
            <!-- FIXME: this template includes a hard-coded dimension!! -->
!           <item id="sample_number" value="12"/>
            <item id="R_template" value="data/[% datadir %]/analysis/templates/profile_count_vs_correct_percentage.R.tt2"/>
          </input>
          <output>
!           <item id="R_file" value="data/[% datadir %]/analysis/results/[% dir %]_profile_count_vs_correct_percentage.R"/>
            <item id="png_file_dir" value="data/[% datadir %]/analysis/results"/>
          </output>
        </step>
!       <step id="44.3[% i %]" active="0" type="shell_command" processor="R">
          <processor_args>
            <arg id="1" name="" value="--vanilla"/>
!           <arg id="2" name="" value="&lt; data/[% datadir %]/analysis/results/[% dir %]_profile_count_vs_correct_percentage.R"/>
          </processor_args>
        </step>
!       [% i = i+1 %]
        [% END %]
  
--- 65,90 ----
          </output>
        </step>
!       [% index = index + 1 %]
!       <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::SummarizeClassification">
          <input>
            <item id="stash_input" value="scores_for_samples"/>
            <item id="subdir" value="[% dir %]"/>
            <!-- FIXME: this template includes a hard-coded dimension!! -->
!           <item id="sample_number" value="16"/>
            <item id="R_template" value="data/[% datadir %]/analysis/templates/profile_count_vs_correct_percentage.R.tt2"/>
          </input>
          <output>
!           <item id="R_file" value="data/[% datadir %]/analysis/results/[% dir %]_profile_count_vs_correct_percentage_profiles.R"/>
            <item id="png_file_dir" value="data/[% datadir %]/analysis/results"/>
          </output>
        </step>
!       [% index = index + 1 %]
!       <step id="[% index %]" active="1" type="shell_command" processor="R">
          <processor_args>
            <arg id="1" name="" value="--vanilla"/>
!           <arg id="2" name="" value="&lt; data/[% datadir %]/analysis/results/[% dir %]_profile_count_vs_correct_percentage_profiles.R"/>
          </processor_args>
        </step>
!       [% index = index + 1 %]
        [% END %]
  

--- NEW FILE: classification_with_vgl.xml ---
      <!-- Now repeat the whole process, this time use the VGL to classify samples -->
      <!-- This step uses the output from ReadVGLOutput -->
      <!-- FIXME: note hardcoded subdir here -->
      <!-- FIXME: the next three steps don't seem to work.  Somewhere the categories seem to be crossed!? -->
      [% index = index + 1 %]
      [% FOREACH dir = dirs %]
      <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::ScoreSamplesViaVGL">
        <input>
          <item id="list_of_files" value="data/[% datadir %]/rand_file_lists/[% testset_w_annotations %]"/>
          <item id="list_of_phenotypes" value="[% profiles_to_count %]"/>
          <item id="vgl_input" value="data/[% datadir %]/vgl_output/parsed_output.storable"/>
          <item id="subdir" value="[% dir %]"/>
          <!-- item id="profile_count_cutoffs" value="5"/ -->
          <item id="profile_count_cutoffs" value="1,2,3,4,5,10,15,20,30,40,50,60,70,80,90,100,all"/>
          <item id="exp_values" value="[% exp_values %]"/>
        </input>
        <output>
          <item id="stash_output" value="vgl_scores_for_samples"/>
          <item id="output_summary_file" value="data/[% datadir %]/analysis/classifications/[% dir %]_percent_vgl_based_classification_summary"/>
        </output>
      </step>
      [% index = index + 1 %]
      <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::SummarizeClassification">
        <input>
          <item id="stash_input" value="vgl_scores_for_samples"/>
          <item id="subdir" value="[% dir %]"/>
          <item id="sample_number" value="16"/> <!-- this is the number of profile_count_cutoffs (w/o 'all') -->
          <!-- FIXME: this template includes a hard-coded dimension!! -->
          <item id="R_template" value="data/[% datadir %]/analysis/templates/profile_count_vs_correct_percentage.R.tt2"/>
        </input>
        <output>
          <item id="R_file" value="data/[% datadir %]/analysis/results/[% dir %]_profile_count_vs_correct_percentage_vgl.R"/>
          <item id="png_file_dir" value="data/[% datadir %]/analysis/results"/>
        </output>
      </step>
      [% index = index + 1 %]
      <step id="[% index %]" active="1" type="shell_command" processor="R">
        <processor_args>
          <arg id="1" name="" value="--vanilla"/>
          <arg id="2" name="" value="&lt; data/[% datadir %]/analysis/results/[% dir %]_profile_count_vs_correct_percentage_vgl.R"/>
        </processor_args>
      </step>
      [% index = index + 1 %]
      [% END %]


--- NEW FILE: p53_breast_cancer_data_Miller_et_al_2005.xml ---
<!-- Variables that are used throughout -->
[% datadir = "p53_breast_cancer_data_Miller_et_al_2005" %]
[% cutoff_for_stability_percent = 10 %]
[% dirs = ['75']  %]
[% dir_str = '75' %]
[% percent_to_hold_for_testset = 35 %]
[% num_profiles = 14 %]
[% total_number_profiles = 100 %]
[% times_to_repeat = 100 %]
[% profile_block_size = 10 %]
[% ppla_block_size = 10 %]
[% index = 0 %]

<project project_name="Project_Logic_Analysis"
         project_description="This project looks at understanding gene relationships using the logic
                              analysis technique created by P. Bowers and T. Yeates.  I've extended 
                              the technique to use microarray data."
         db_uri="dbi:Pg:host=164.67.97.78;dbname=pipe" db_user="boconnor" db_password="">
  <pipe pipe_name="Logic_Analysis_Network_Stability_With_p53_Breast_Cancer_Data_Miller_et_al_Pipe"
        pipe_desc="Tests the stability of networks using the p53 breast cancer dataset by Miller et al."
        pipe_dir="/raid5a/boconnor/cvsroot/dev-boconnor/project_logic_analysis"
        testing_mode="1">
    <settings>
      <plugin name="Logger" processor="Nelson::Pipe::Container::Plugin::Logger" log_to="db"/>
      <plugin name="SystemStateRecorder" processor="Nelson::Pipe::Container::Plugin::SystemStateRecorder"/>
      <plugin name="Versioner" processor="Nelson::Pipe::Container::Plugin::CVSVersioner"
              version_dir="/raid5a/boconnor/cvsroot/dev-boconnor/project_logic_analysis"
              tag_identifier="Logic_Analysis_Network_Stability_With_T-Cell_Leukemia_Data_Pipe"/>
      <plugin name="Publisher" processor="Nelson::Pipe::Container::Plugin::WebPublisher"
              publish_root_dir="/raid5a/boconnor/public_html/Projects"
              publish_url_prefix="http://sumo.genetics.ucla.edu/~boconnor/Projects"/>
    </settings>
    <initialization>
      <plugin name="Logger"/>
      <plugin name="SystemStateRecorder"/>
      <plugin name="Versioner"/>
    </initialization>
    <run>

      [%# this is an example of a comment %]

      <!--
        Parses SIF, makes profiles, runs LA, and parses the result into a common data structure (which 
        is detailed in my blog entry here: https://boconnor.is-a-geek.com/wiki/index.php?n=BoconnorResearchBlog.20051108).
      -->
      [% sif_file = "data/p53_breast_cancer_data_Miller_et_al_2005/sif/p53_sif.txt" %]
      [% file_map_file = "data/p53_breast_cancer_data_Miller_et_al_2005/sif/p53_map.txt" %]
      [% phenotypes = "NA" %]
      [% col_ordering = "NA" %]
      [% sif_format = "geo" %]
      [% random_selection_technique = "across_all_samples" %]
      [% parse_old_mas5 = 0 %]
      [% profiles_to_count = "grade_1,grade_2,grade_3,lymph_pos,lymph_neg,er_pos,er_neg,pgr_pos,pgr_neg,dlda_wt,dlda_mt,p53_wt,p53_mt" %]
      [% compare_to_reference = "0" %]
      [% INCLUDE make_profiles_and_run_la_include.xml %]

      <!-- Code that parses the Voting Gene List output -->
<!-- LEFT OFF HERE -->
<!-- The ReadVGLOutput module needs to be reworked to remove reference to 100% dataset and also to parse output correctly -->
      [% index = index + 100 %]
      [% base_col = 104 %]
      [% input_file_name = "dChipExpr_BreastCancer_groupTtest.xls" %]
      [% pheno_str_1 = "DLDA_WT" %]
      [% pheno_str_2 = "DLDA_MT" %]
      [%# INCLUDE vgl_parsing_pipe.xml %]

      <!-- 
       Now take the logic analysis information and extract the top X profiles present in Y% or more of the experiments
       and 1) produce a sorted HTML output for it that can be browsed and 2) build up networks for the same set of
       profiles and graph them out with graphviz.
      -->
     <!-- FIXME: It doesn't always find the profile in the reference set.  Need to fix this!!! -->
     <!-- FIXME: I thought I fixed the not finding profile in ref set problem but still happens in 50% set -->
     <!-- FIXME: need to find all the profiles otherwise there won't be much to classify with -->
      [% index = index + 29 %] <!-- FIXME: scoping issues with this variable!! -->
      <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::PickTopProfiles">
        <input>
          <item id="parsed_output_stashname" value="summary_of_ppla_output"/>
          <item id="parsed_output_filename"  value="data/[% datadir %]/ppla_output/parsed_output.storable"/>
          <item id="profile_sub_dirs" value="[% dir_str %]"/>
          <item id="cutoff" value="[% cutoff_for_stability_percent %]"/>
          <item id="complete_ppla_output" value="data/[% datadir %]/ppla_output/100/file_list_0.output"/> <!-- Not used -->
        </input>
        <output>
          <item id="output_dir" value="data/[% datadir %]/top_profiles"/>
        </output>
      </step>

      <!-- Morgan's program (which I heavily modified) to create an HTML document to display the profiles of interest -->
      <!-- perl visualiseTriplets.pl ../glioma_data/sorted_profiles_both_annotated.out ../glioma_data/profiles.txt -eprofile_results_complete_annotations.storable > ~/public_html/project_logic_analysis/glioma_profiles/brain_profiles_logic_type_both_annotated.html -->
      [% index = index + 1 %]
      [% FOREACH dir = dirs %]
      <!-- FIXME: does this sort the profiles? I don't do that elsewhere. Maybe I should in PickTopProfiles -->
      <step id="[% index %]" active="0" type="shell_command" processor="perl scripts/visualiseTriplets.pl">
        <processor_args>
          <arg id="1" name="" value="data/[% datadir %]/top_profiles/[% dir %]/top_[% cutoff_for_stability_percent %]_percent.profiles"/>
          <arg id="2" name="" value="data/[% datadir %]/profiles/100/file_list_0.profile"/>
          <arg id="3" name="" value="-edata/annotations/profile_results_complete_annotations.storable"/>
          <arg id="4" name="" value="&gt; data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.html"/>
        </processor_args>
        <output>
          <item id="output_file" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.html" publish="0"/>
        </output>
      </step>
      [% index = index + 1 %]
      [% END %]


      <!-- 
        This section calls Peter's code to calculate p-values based on hypergeometric dist.
        It relies on the output of ReadProfileOutput and PickTopProfiles. It's a hack on the 
        visualizer to create an input to Peter's hypergeometric calculation.
        -->
      [% index = index + 1 %]
      [% FOREACH dir = dirs %]
      <!-- FIXME: does this sort the profiles? I don't do that elsewhere. Maybe I should in PickTopProfiles -->
      <step id="[% index %]" active="0" type="shell_command" processor="perl scripts/exportTriplets.pl">
        <processor_args>
          <arg id="1" name="" value="data/[% datadir %]/top_profiles/[% dir %]/top_[% cutoff_for_stability_percent %]_percent.profiles"/>
          <arg id="2" name="" value="data/[% datadir %]/profiles/100/file_list_0.profile"/>
          <arg id="3" name="" value="-edata/annotations/profile_results_complete_annotations.storable"/>
          <arg id="4" name="" value="&gt; data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt"/>
        </processor_args>
        <output>
          <item id="output_file" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt" publish="0"/>
        </output>
      </step>
      [% index = index + 1 %]
      <step id="[% index %]" active="0" type="shell_command" processor="perl scripts/hypergeometric.pl">
        <processor_args>
          <arg id="1" name="" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt"/>
          <arg id="2" name="" value="&gt; data/[% datadir %]/hypergeometric/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent_hyper_probs.txt"/>
        </processor_args>
      </step>
      <!-- This script just takes the output from exportTriplets and adds some additional information (stability score and p-value)
           It also reads in the storable object (profile_data) which contains a ton of parsed data and creates a new "p-value"
           entry that stores the various p-value calculations done by Peter.  This is used by FindMostConnectedNodes and 
           visualiseTriplets.pl to annotate the results.
           This script writes the frequency and p-values back to data/[% datadir %]/ppla_output/parsed_output.storable
        -->
      <!-- FIXME: I should look for ways to consolidate the writing of p-values and freq. back to parsed_output.storable -->
      <!-- FIXME: the information contained in the output of this script is really useful and I should (somehow) add it to the 
           visualiseTriplets.pl output. -->
      [% index = index + 1 %]
      <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::AppendPValuesToExportOutput">
        <input>
          <item id="profile_output" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt"/>
          <item id="hypergeometric_output" value="data/[% datadir %]/hypergeometric/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent_hyper_probs.txt"/>
          <item id="profile_data" value="data/[% datadir %]/ppla_output/parsed_output.storable"/>
          <item id="subdir" value="[% dir %]"/>
        </input>
        <output>
          <item id="output_file"  value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent_w_pvalues.txt"/>
          <item id="output_storable" value="data/[% datadir %]/ppla_output/parsed_output.storable"/>
        </output>
      </step>
      [% index = index + 1 %]
      [% END %]

      <!-- now parse out the top profiles and collect some statistics on them -->
      <!-- FIXME: this is redundant with what's below! -->
      [% index = index + 1 %]
      <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAOutputProfiles">
        <input>
          <item id="ppla_output_profiles_dir" value="data/[% datadir %]/top_profiles"/>
          <item id="subdirs" value="[% dir_str %]"/>
          <item id="filename" value="top_[% cutoff_for_stability_percent %]_percent.profiles"/>
        </input>
        <output>
          <item id="stash_output" value="stable_ppla_output_profiles"/>
          <item id="stash_output_file" value="data/[% datadir %]/top_profiles/parsed_ppla_output.storable"/>
        </output>
      </step>
      <!--
        This step goes through the PPLA output parsed above and counts how many times a given
        probeset is included in a triplet relationship.  It then summarizes this information into
        a hash and hands off the display of the information to a tt2.  The output is an HTML document
        that lists the most connected genes and links to the output to visualiseTriplets.pl for each subset.
        This script is responsible for calling visualiseTriplets.pl on the subset of the triplets in question
        to visualize the individual networks with html and png output. 
        -->
      <!-- FIXME: this step should be followed with other network-based analysis on the logic relationships -->
      <!-- FIXME: subdir is currently hardcoded inside this script!! -->
      <!-- FIXME: remove the calls to other programs/scripts and move this fxn into another module -->
      [% index = index + 1 %]
      [% FOREACH dir = dirs %]
      <!-- FIXME: failing with "undef error - Can't use string ("") as a HASH ref while "strict refs" in use at lib/perl/Nelson/Pipe/Container/Job/FindMostConnectedNodes.pm line 383" -->
      <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::FindMostConnectedNodes">
        <input>
          <item id="stash_input" value="stable_ppla_output_profiles"/>
          <item id="ppla_output_profiles_dir" value="data/[% datadir %]/top_profiles"/>
          <item id="subdir" value="[% dir%]"/>
          <item id="extra_info" value="data/annotations/profile_results_complete_annotations.storable"/>
          <item id="filename" value="top_[% cutoff_for_stability_percent %]_percent.profiles"/>
          <item id="pvalues" value="data/[% datadir %]/ppla_output/parsed_output.storable"/>
          <item id="AppendPValuesToExportOutput_output" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent_w_pvalues.txt"/>
          <item id="min_triplets" value="3"/>
          <item id="template_most_connected" value="index_for_connected_nodes.tt2"/>
          <item id="template_lowest_p_value" value="index_for_connected_nodes.tt2"/>
          <item id="template_most_stable" value="index_for_connected_nodes.tt2"/>
          <item id="template_detailed" value="details_for_connected_nodes.tt2"/>
          <item id="template_dir" value="data/[% datadir %]/analysis/templates"/>
          <item id="profiles_to_count" value="[% profiles_to_count %]"/>
        </input>
        <output>
          <item id="output_dir" value="data/[% datadir %]/visualization/90/top_[% cutoff_for_stability_percent %]_percent_stable/breakdown_3_or_more"/>
        </output>
      </step>
      [% index = index + 1 %]
      [% END %]

      <!-- Just creates a summary page at http://humerus/project_logic_analysis -->
      [% index = index + 1 %]
      <step id="[% index %]" active="0" type="shell_command" processor="./scripts/wiki2html.pl">
        <processor_args>
          <arg id="1" name="" value="data/[% datadir %]/visualization/introduction.txt"/>
          <arg id="2" name="" value="&gt; /raid5a/boconnor/public_html/Projects/Project_Logic_Analysis/Logic_Analysis_Network_Stability_With_Original_Brain_Tumor_Data_Pipe/index.html"/>
        </processor_args>
      </step>

      <!-- look for a bias in the oncogene/tumor suppressor annotations. This is less flexible than
           the generic annotation bias checker below
        -->
      <!-- FIXME: includes some hardcoded elements -->
      [%# INCLUDE oncogene_counts.xml %]

      <!-- This series of scripts takes an input list of "interesting" probesets and build a network of what they connect with
           It annotates those probesets using the Affy array information file and then colors the nodes as green if a "!" probeset
           and red if it's expressed. FIXME: this assumption only works if the network is built with one phenotype at a time!
           The nodes then link back to the summary HTML descriptions and the edges link to records within the HTML description files
           making it easy to see the actual relationship, binary profiles, and additional free text annotations associated with each
           probeset.  The outp...
 
[truncated message content]

[Nelsonlab-cmts] libnelson/Pg/celsius/bin gecIDsync,1.1,1.2

From: <all...@su...> - 2006-02-17 02:15:50

Update of /cvsroot/libnelson/Pg/celsius/bin
In directory sumo.genetics.ucla.edu:/tmp/cvs-serv8931/bin

Modified Files:
	gecIDsync 
Log Message:
now loads annotations to annot.allenday_gec


Index: gecIDsync
===================================================================
RCS file: /cvsroot/libnelson/Pg/celsius/bin/gecIDsync,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** gecIDsync	31 Jan 2006 01:14:42 -0000	1.1
--- gecIDsync	17 Feb 2006 02:15:47 -0000	1.2
***************
*** 9,13 ****
  
  my $dbh = DBI->connect('dbi:Pg:dbname=chado-celsius;host=soleus.ctrl.ucla.edu','','');
! $dbh->do('SET search_path TO cel, public');
  
  my $select1_sth = $dbh->prepare('SELECT cel_id FROM cel_dbxref, dbxref WHERE cel_dbxref.dbxref_id = dbxref.dbxref_id AND dbxref.accession LIKE ?');
--- 9,13 ----
  
  my $dbh = DBI->connect('dbi:Pg:dbname=chado-celsius;host=soleus.ctrl.ucla.edu','','');
! $dbh->do('SET search_path TO cel, annot, public');
  
  my $select1_sth = $dbh->prepare('SELECT cel_id FROM cel_dbxref, dbxref WHERE cel_dbxref.dbxref_id = dbxref.dbxref_id AND dbxref.accession LIKE ?');
***************
*** 16,19 ****
--- 16,20 ----
  my $insert1_sth = $dbh->prepare('INSERT INTO dbxref (db_id, accession) VALUES ((SELECT db_id FROM db WHERE name = ?),?)');
  my $insert2_sth = $dbh->prepare('INSERT INTO cel_dbxref (cel_id, dbxref_id) VALUES (?,?)');
+ my $insert3_sth = $dbh->prepare('INSERT INTO annot.allenday_gec (biomaterial_id, type_id) VALUES (?,(SELECT c.cvterm_id FROM cvterm AS c, dbxref AS x, db AS d WHERE c.dbxref_id = x.dbxref_id AND x.db_id = d.db_id AND x.accession = ? AND d.name = ?))');
  
  my @files = $dom->getElementsByTagName('file');
***************
*** 41,44 ****
--- 42,52 ----
    ( $x ) = $select2_sth->fetchrow_array();
    $insert2_sth->execute( $c, $x );
+ 
+   my @annots = $file->getElementsByTagName('annotation');
+   foreach my $annot ( @annots ) {
+     my $accession = $annot->getAttribute( 'accession' );
+     my ( $db, $acc ) = $accession =~ m/^(.+?):(.+?)$/;
+     $insert3_sth->execute( $c, $acc, $db );
+   }
  }

[Nelsonlab-cmts] dev-boconnor/project_logic_analysis/scripts vgl_input_script.pl,NONE,1.1 extract_probesets.sh,1.1,1.2 make_mas5.R,1.2,1.3

From: <boc...@su...> - 2006-02-17 01:57:17

Update of /cvsroot/dev-boconnor/project_logic_analysis/scripts
In directory sumo.genetics.ucla.edu:/tmp/cvs-serv7343/scripts

Modified Files:
	extract_probesets.sh make_mas5.R 
Added Files:
	vgl_input_script.pl 
Log Message:
Added quick script to make VGL sif file for Marc


Index: make_mas5.R
===================================================================
RCS file: /cvsroot/dev-boconnor/project_logic_analysis/scripts/make_mas5.R,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** make_mas5.R	11 Aug 2005 23:13:11 -0000	1.2
--- make_mas5.R	17 Feb 2006 01:57:10 -0000	1.3
***************
*** 3,7 ****
  library(affy);
  
! file_list = list.files(path="/raid5a/boconnor/cvsroot/dev-boconnor/project_logic_analysis/data/original_glioma_dataset/cel",pattern=".CEL",full.names=TRUE);
  
  for (file_name in file_list)
--- 3,7 ----
  library(affy);
  
! file_list = list.files(path="/raid5a/boconnor/cvsroot/dev-boconnor/project_logic_analysis/data/t-cell_leukemia_data_Soulier_et_al_2005/cel",pattern=".CEL",full.names=TRUE);
  
  for (file_name in file_list)

Index: extract_probesets.sh
===================================================================
RCS file: /cvsroot/dev-boconnor/project_logic_analysis/scripts/extract_probesets.sh,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** extract_probesets.sh	3 Feb 2006 04:39:30 -0000	1.1
--- extract_probesets.sh	17 Feb 2006 01:57:10 -0000	1.2
***************
*** 1,9 ****
  #!/bin/bash
  
! i=0
  while [ $i -le $1 ]
  do
!   echo "cat $2 | grep -P '^5' | grep HC_2B | awk '{ print $3 }' | sort | uniq > $3/$i.txt"
!   cat $2 | grep -P '^5' | grep HC_2B | awk '{ print $3 }' | sort | uniq > $3/$i.txt
    i=$((i+1))
  done
--- 1,20 ----
  #!/bin/bash
  
! # non-random list
! 
! echo "Extracting Non-Random List"
! echo "cat $3 | grep -P '^5' | grep $2 | awk '{ print $3 }' | sort | uniq > $5/non_random.txt"
! cat $3 | grep -P '^5' | grep $2 | awk '{ print $3 }' | sort | uniq > $5/non_random.txt
! 
! # random list
! echo "cat $4 | grep -P '^\d+' | awk '{ print $1 }' | sort | uniq > /tmp/all_probesets.txt"
! cat $4 | grep -P '^\d+' | awk '{ print $1 }' | sort | uniq > /tmp/all_probesets.txt
! 
! i=1
  while [ $i -le $1 ]
  do
!   echo "Extracting Random List $i"
!   echo "cat /tmp/all_probesets.txt | perl scripts/select_probesets_randomly.pl `wc $5/non_random.txt | awk '{ print $1 }'` > $5/random_$i.txt"
!   cat /tmp/all_probesets.txt | perl scripts/select_probesets_randomly.pl `wc $5/non_random.txt | awk '{ print $1 }'` > $5/random_$i.txt
    i=$((i+1))
  done

--- NEW FILE: vgl_input_script.pl ---
#!/usr/local/bin/perl

# Quick, hard-coded helper that prints the VGL SIF table from a stored SIF hash.
# Output: one tab-separated line per file key, in sorted order:
#   <file> <TAB> <file> <TAB> DLDA_MT|DLDA_WT

use strict;
use warnings;

use Storable qw(retrieve);

# Input path is hard-coded and relative to the current working directory.
my $sif_hash_path = "data/p53_breast_cancer_data_Miller_et_al_2005/sif_hash.storable";

# retrieve() can return undef instead of dying; stop here rather than
# silently printing an empty table.
my $data = retrieve($sif_hash_path);
die "Can't retrieve $sif_hash_path\n" unless defined $data;

foreach my $file (sort keys %{ $data->{'files'} }) {
  # A true 'dlda_mt' value yields the DLDA_MT label; anything else is DLDA_WT.
  my $dlda_txt = $data->{'files'}{$file}{'dlda_mt'} ? "DLDA_MT" : "DLDA_WT";
  print "$file\t$file\t$dlda_txt\n";
}

[Nelsonlab-cmts] libnelson/java gecCel.java,1.1,1.2

From: <all...@su...> - 2006-02-17 01:45:39

Update of /cvsroot/libnelson/java
In directory sumo.genetics.ucla.edu:/tmp/cvs-serv6451

Modified Files:
	gecCel.java 
Log Message:
now exports annotations as well


Index: gecCel.java
===================================================================
RCS file: /cvsroot/libnelson/java/gecCel.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** gecCel.java	31 Jan 2006 00:50:02 -0000	1.1
--- gecCel.java	17 Feb 2006 01:45:32 -0000	1.2
***************
*** 19,40 ****
      String SEL_EXP_RUN = "SELECT sample.sample_id, chip.chip_id, chip.file_id || '.file' file_id FROM gen_experiment_chips sample, gen_chip_files chip, gen_file_types type WHERE type.file_type_id = chip.file_type_id AND sample.chip_id = chip.chip_id AND type.file_extension = 'cel'";
  
      Connection conn = null;
!     PreparedStatement pstmt = null;
!     ResultSet rs = null;
! 	
      try {
        conn = getConnection();
!       pstmt = conn.prepareStatement(SEL_EXP_RUN);
!       rs = pstmt.executeQuery();
        System.out.println("<files>");
!       while(rs.next()) {
!         int chip_id   = rs.getInt("CHIP_ID");
!         String file_id   = rs.getString("FILE_ID");
!         int sample_id = rs.getInt("SAMPLE_ID");
!         System.out.println("  <file file_id=\"" + file_id + "\" chip_id=\"" + chip_id + "\" sample_id=\"" + sample_id + "\"/>"); 
  
        }
        System.out.println("</files>");
!       cleanup(conn, pstmt, rs);
      }
      catch(Exception e){
--- 19,68 ----
      String SEL_EXP_RUN = "SELECT sample.sample_id, chip.chip_id, chip.file_id || '.file' file_id FROM gen_experiment_chips sample, gen_chip_files chip, gen_file_types type WHERE type.file_type_id = chip.file_type_id AND sample.chip_id = chip.chip_id AND type.file_extension = 'cel'";
  
+ //    String SEL_ANNOT = "SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.cell_type_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND sample_id = ? UNION SELECT d.sample_id, 'null:' || c.name FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c WHERE d.dev_stage_level_id = c.cvterm_id AND sample_id = ? UNION SELECT d.sample_id, 'null:' || c.name FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c WHERE d.cell_growth_level_id = c.cvterm_id AND sample_id = ? UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_ds d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.disease_state_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND d.sample_id = ? UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_ed d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.experiment_design_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND d.sample_id = ? UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_pt d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.phenotype_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND d.sample_id = ? UNION SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.rna_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND sample_id = ?";
+ 
      Connection conn = null;
!     PreparedStatement stmt1 = null;
!     PreparedStatement stmt2 = null;
!     ResultSet rs1 = null;
!     ResultSet rs2 = null;
! 
      try {
        conn = getConnection();
!       stmt1 = conn.prepareStatement(SEL_EXP_RUN);
!       rs1 = stmt1.executeQuery();
        System.out.println("<files>");
!       while(rs1.next()) {
!         int chip_id   = rs1.getInt("CHIP_ID");
!         String file_id   = rs1.getString("FILE_ID");
!         int sample_id = rs1.getInt("SAMPLE_ID");
! 
!         System.out.println("  <file file_id=\"" + file_id + "\" chip_id=\"" + chip_id + "\" sample_id=\"" + sample_id + "\">"); 
! 
!         stmt2 = conn.prepareStatement(
!           "SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.cell_type_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND sample_id = "+sample_id+" UNION "+
!           "SELECT d.sample_id, 'null:' || c.name FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c WHERE d.dev_stage_level_id = c.cvterm_id AND sample_id = "+sample_id+" UNION "+
!           "SELECT d.sample_id, 'null:' || c.name FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c WHERE d.cell_growth_level_id = c.cvterm_id AND sample_id = "+sample_id+" UNION "+
!           "SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_ds d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.disease_state_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND d.sample_id = "+sample_id+" UNION "+
!           "SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_ed d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.experiment_design_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND d.sample_id = "+sample_id+" UNION "+
!           "SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_pt d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.phenotype_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND d.sample_id = "+sample_id+" UNION "+
!           "SELECT d.sample_id, x.accession FROM gen_annotation_exp_sample_data d, gen_annotation_cvterm c, gen_annotation_dbxref x WHERE d.rna_level_id = c.cvterm_id AND c.dbxref_id = x.dbxref_id AND sample_id = "+sample_id
!         );
! 
! //        rs2 = stmt2.executeQuery( sample_id, sample_id, sample_id, sample_id, sample_id, sample_id, sample_id );
!         rs2 = stmt2.executeQuery();
! 
!         while(rs2.next()) {
!           String accession = rs2.getString("ACCESSION");
!           System.out.println("    <annotation accession=\""+ accession +"\"/>");
!         }
  
+         rs2.close();
+         rs2 = null;
+         stmt2.close();
+ 
+         System.out.println("  </file>");
        }
        System.out.println("</files>");
!       cleanup(conn, stmt1, rs1);
      }
      catch(Exception e){
***************
*** 53,65 ****
    }
      
!   private static void cleanup(Connection conn, PreparedStatement pstmt, ResultSet rs) throws SQLException {
!     if (rs != null) {
!       rs.close();
!       rs = null;
      }
!     if (pstmt != null) {
!       pstmt.close();
!       pstmt = null;
      }
      if (conn != null) {
        conn.close();
--- 81,97 ----
    }
      
!   private static void cleanup(Connection conn, PreparedStatement stmt1, ResultSet rs1) throws SQLException {
!     if (rs1 != null) {
!       rs1.close();
!       rs1 = null;
      }
!     if (stmt1 != null) {
!       stmt1.close();
!       stmt1 = null;
      }
+ //    if (stmt2 != null) {
+ //      stmt2.close();
+ //      stmt2 = null;
+ //    }
      if (conn != null) {
        conn.close();

[Nelsonlab-cmts] dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job CreateProfiles.pm,1.14,1.15 SifFileParser.pm,1.8,1.9

From: <boc...@su...> - 2006-02-17 00:09:31

Update of /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job
In directory sumo.genetics.ucla.edu:/tmp/cvs-serv31102/lib/perl/Nelson/Pipe/Container/Job

Modified Files:
	CreateProfiles.pm SifFileParser.pm 
Log Message:
Changes to libs


Index: SifFileParser.pm
===================================================================
RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/SifFileParser.pm,v
retrieving revision 1.8
retrieving revision 1.9
diff -C2 -d -r1.8 -r1.9
*** SifFileParser.pm	16 Feb 2006 00:31:19 -0000	1.8
--- SifFileParser.pm	17 Feb 2006 00:09:22 -0000	1.9
***************
*** 167,187 ****
            my $grade = $tokens[4];
  
!           # ER status (ER+=mutant; ER-=wt)
!           my $er_wt = 1;
!           if ($tokens[5] eq 'ER+') { $er_wt = 0; }
  
            # PgR status (PgR+=mutant; PgR-=wt)
!           my $pgr_wt = 1;
!           if ($tokens[6] eq 'PgR+') { $pgr_wt = 0; }
  
            # Lymph node status 
!           my $lymph_pos = 1;
!           if ($tokens[9] eq 'LN-') { $lymph_pos = 0; }
  
            $self->{files}{$filename}{p53_wt} = $p53_wt;
            $self->{files}{$filename}{dlda_mt} = $dlda_mt;
            $self->{files}{$filename}{grade} = $grade;
!           $self->{files}{$filename}{er_wt} = $er_wt;
!           $self->{files}{$filename}{pgr_wt} = $pgr_wt;
            $self->{files}{$filename}{lymph_pos} = $lymph_pos;
        }
--- 167,187 ----
            my $grade = $tokens[4];
  
!           # ER status
!           my $er_pos = 0;
!           if ($tokens[5] eq 'ER+') { $er_pos = 1; }
  
            # PgR status (PgR+=mutant; PgR-=wt)
!           my $pgr_pos = 0;
!           if ($tokens[6] eq 'PgR+') { $pgr_pos = 1; }
  
            # Lymph node status 
!           my $lymph_pos = 0;
!           if ($tokens[9] eq 'LN+') { $lymph_pos = 1; }
  
            $self->{files}{$filename}{p53_wt} = $p53_wt;
            $self->{files}{$filename}{dlda_mt} = $dlda_mt;
            $self->{files}{$filename}{grade} = $grade;
!           $self->{files}{$filename}{er_pos} = $er_pos;
!           $self->{files}{$filename}{pgr_pos} = $pgr_pos;
            $self->{files}{$filename}{lymph_pos} = $lymph_pos;
        }

Index: CreateProfiles.pm
===================================================================
RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/CreateProfiles.pm,v
retrieving revision 1.14
retrieving revision 1.15
diff -C2 -d -r1.14 -r1.15
*** CreateProfiles.pm	16 Feb 2006 20:44:14 -0000	1.14
--- CreateProfiles.pm	17 Feb 2006 00:09:22 -0000	1.15
***************
*** 245,248 ****
--- 245,253 ----
      $results->{samples}{annotations}{grade_3}{$filename} = 1;
      $results->{samples}{annotations}{grade_4}{$filename} = 0;
+   } elsif ($sif_hash->{'files'}{$old_filename}{grade} =~ /G\?/) {
+     $results->{samples}{annotations}{grade_1}{$filename} = 0;
+     $results->{samples}{annotations}{grade_2}{$filename} = 0;
+     $results->{samples}{annotations}{grade_3}{$filename} = 0;
+     $results->{samples}{annotations}{grade_4}{$filename} = 0;
    }
  
***************
*** 256,275 ****
    }
  
!   # er_wt
!   if ($sif_hash->{'files'}{$old_filename}{er_wt} =~ /1/) {
!     $results->{samples}{annotations}{er_wt}{$filename} = 1;
!     $results->{samples}{annotations}{er_mt}{$filename} = 0;
!   } elsif ($sif_hash->{'files'}{$old_filename}{er_wt} =~ /0/) {
!     $results->{samples}{annotations}{er_wt}{$filename} = 0;
!     $results->{samples}{annotations}{er_mt}{$filename} = 1;
    }
  
!   # pgr_wt
!   if ($sif_hash->{'files'}{$old_filename}{pgr_wt} =~ /1/) {
!     $results->{samples}{annotations}{pgr_wt}{$filename} = 1;
!     $results->{samples}{annotations}{pgr_mt}{$filename} = 0;
!   } elsif ($sif_hash->{'files'}{$old_filename}{pgr_wt} =~ /0/) {
!     $results->{samples}{annotations}{pgr_wt}{$filename} = 0;
!     $results->{samples}{annotations}{pgr_mt}{$filename} = 1;
    }
  
--- 261,280 ----
    }
  
!   # er_pos
!   if ($sif_hash->{'files'}{$old_filename}{er_pos} =~ /1/) {
!     $results->{samples}{annotations}{er_pos}{$filename} = 1;
!     $results->{samples}{annotations}{er_neg}{$filename} = 0;
!   } elsif ($sif_hash->{'files'}{$old_filename}{er_pos} =~ /0/) {
!     $results->{samples}{annotations}{er_pos}{$filename} = 0;
!     $results->{samples}{annotations}{er_neg}{$filename} = 1;
    }
  
!   # pgr_pos
!   if ($sif_hash->{'files'}{$old_filename}{pgr_pos} =~ /1/) {
!     $results->{samples}{annotations}{pgr_pos}{$filename} = 1;
!     $results->{samples}{annotations}{pgr_neg}{$filename} = 0;
!   } elsif ($sif_hash->{'files'}{$old_filename}{pgr_pos} =~ /0/) {
!     $results->{samples}{annotations}{pgr_pos}{$filename} = 0;
!     $results->{samples}{annotations}{pgr_neg}{$filename} = 1;
    }

[Nelsonlab-cmts] dev-boconnor/project_logic_analysis/conf make_profiles_and_run_la_include.xml,1.4,1.5

From: <boc...@su...> - 2006-02-17 00:09:15

Update of /cvsroot/dev-boconnor/project_logic_analysis/conf
In directory sumo.genetics.ucla.edu:/tmp/cvs-serv31072/conf

Modified Files:
	make_profiles_and_run_la_include.xml 
Log Message:
Changes to conf files


Index: make_profiles_and_run_la_include.xml
===================================================================
RCS file: /cvsroot/dev-boconnor/project_logic_analysis/conf/make_profiles_and_run_la_include.xml,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -d -r1.4 -r1.5
*** make_profiles_and_run_la_include.xml	16 Feb 2006 08:19:58 -0000	1.4
--- make_profiles_and_run_la_include.xml	17 Feb 2006 00:09:07 -0000	1.5
***************
*** 2,6 ****
        [% index = index + 1 %]
        <!-- parses the SIF file to generate a hash of file names and their HC -->
!       <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::SifFileParser">
          <input>
            <item id="sif_file" value="[% sif_file %]"/>
--- 2,6 ----
        [% index = index + 1 %]
        <!-- parses the SIF file to generate a hash of file names and their HC -->
!       <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::SifFileParser">
          <input>
            <item id="sif_file" value="[% sif_file %]"/>
***************
*** 19,23 ****
        <!-- reads the hash created by the SIF parser and randomly selects files for 10-90% of the samples -->
        <!-- FIXME: this creates rounding errors in which the number of files selected is less than it should be! -->
!       <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::RandomlySelectFiles">
          <input>
            <item id="sif_file_hash" value="sif_file_hash"/>
--- 19,23 ----
        <!-- reads the hash created by the SIF parser and randomly selects files for 10-90% of the samples -->
        <!-- FIXME: this creates rounding errors in which the number of files selected is less than it should be! -->
!       <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::RandomlySelectFiles">
          <input>
            <item id="sif_file_hash" value="sif_file_hash"/>
***************
*** 39,44 ****
        [% WHILE j < total_number_profiles %]
        [% FOREACH dir = dirs %]
!       <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::CreateProfiles">
!       <!-- step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::CreateProfiles" execution_type="cluster" -->
          <input>
            <item id="file_list_dir" value="data/[% datadir %]/rand_file_lists"/>
--- 39,44 ----
        [% WHILE j < total_number_profiles %]
        [% FOREACH dir = dirs %]
!       <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::CreateProfiles">
!       <!-- step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::CreateProfiles" execution_type="cluster" -->
          <input>
            <item id="file_list_dir" value="data/[% datadir %]/rand_file_lists"/>
***************
*** 62,68 ****
        [% END %]
  
        [% index = index + 1 %]
        <!-- the next two steps just read all the profiles -->
!       <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::RandomlySelectFiles">
          <input>
            <item id="sif_file_hash" value="sif_file_hash"/>
--- 62,69 ----
        [% END %]
  
+       <!-- large datasets will take a long time to run and need to use the special version of PPLA -->
        [% index = index + 1 %]
        <!-- the next two steps just read all the profiles -->
!       <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::RandomlySelectFiles">
          <input>
            <item id="sif_file_hash" value="sif_file_hash"/>
***************
*** 76,80 ****
        </step>
        [% index = index + 1 %]
!       <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::CreateProfiles">
          <input>
            <item id="file_list_dir" value="data/[% datadir %]/complete_file_list"/>
--- 77,81 ----
        </step>
        [% index = index + 1 %]
!       <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::CreateProfiles">
          <input>
            <item id="file_list_dir" value="data/[% datadir %]/complete_file_list"/>
***************
*** 83,87 ****
            <item id="sif_file_hash" value="sif_file_hash"/>
            <item id="sif_file_hash_storable" value="data/[% datadir %]/sif_hash.storable"/>
!           <item id="parse_old_mas5" value="1"/>
            <item id="start" value="0"/>
            <item id="end" value="1"/>
--- 84,88 ----
            <item id="sif_file_hash" value="sif_file_hash"/>
            <item id="sif_file_hash_storable" value="data/[% datadir %]/sif_hash.storable"/>
!           <item id="parse_old_mas5" value="[% parse_old_mas5 %]"/>
            <item id="start" value="0"/>
            <item id="end" value="1"/>
***************
*** 101,105 ****
        [% WHILE j < total_number_profiles %]
        <!-- execution_type="cluster" -->
!       <step id="[% index %].[% j %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::PPLARunner" execution_type="cluster">
          <input>
            <item id="entropy_filter" value="3"/>
--- 102,106 ----
        [% WHILE j < total_number_profiles %]
        <!-- execution_type="cluster" -->
!       <step id="[% index %].[% j %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::PPLARunner" execution_type="cluster">
          <input>
            <item id="entropy_filter" value="3"/>

[Nelsonlab-cmts] dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job CreateProfiles.pm,1.13,1.14 RandomlySelectFiles.pm,1.8,1.9

From: <boc...@su...> - 2006-02-16 20:44:25

Update of /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job
In directory sumo.genetics.ucla.edu:/tmp/cvs-serv14105/lib/perl/Nelson/Pipe/Container/Job

Modified Files:
	CreateProfiles.pm RandomlySelectFiles.pm 
Log Message:
Changed the create profile script


Index: RandomlySelectFiles.pm
===================================================================
RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/RandomlySelectFiles.pm,v
retrieving revision 1.8
retrieving revision 1.9
diff -C2 -d -r1.8 -r1.9
*** RandomlySelectFiles.pm	16 Feb 2006 07:37:02 -0000	1.8
--- RandomlySelectFiles.pm	16 Feb 2006 20:44:14 -0000	1.9
***************
*** 68,73 ****
  
     } elsif ($random_selection_technique eq 'across_all_samples') {
! 
!      open TESTSET, ">$rand_file_list_dir/test_set_$percent_to_rand_select.txt" or die;
         my @files = shuffle keys(%{$stash->{$sif_file_hash}{'files'}});
         my $size = int(0.01 * $percent_to_hold_for_testset * scalar(@files));
--- 68,73 ----
  
     } elsif ($random_selection_technique eq 'across_all_samples') {
!      system ("mkdir -p $rand_file_list_dir");
!      open TESTSET, ">$rand_file_list_dir/test_set_$percent_to_rand_select.txt" or die "Can't open $rand_file_list_dir/test_set_$percent_to_rand_select.txt\n";
         my @files = shuffle keys(%{$stash->{$sif_file_hash}{'files'}});
         my $size = int(0.01 * $percent_to_hold_for_testset * scalar(@files));

Index: CreateProfiles.pm
===================================================================
RCS file: /cvsroot/dev-boconnor/project_logic_analysis/lib/perl/Nelson/Pipe/Container/Job/CreateProfiles.pm,v
retrieving revision 1.13
retrieving revision 1.14
diff -C2 -d -r1.13 -r1.14
*** CreateProfiles.pm	16 Feb 2006 08:20:06 -0000	1.13
--- CreateProfiles.pm	16 Feb 2006 20:44:14 -0000	1.14
***************
*** 39,43 ****
     $self->{mas5_cache} = {};
     if ($pre_cache_mas5 eq '1') {
!      $self->{mas5_cache} = $self->_pre_cache_mas5($mas5_dir);
     }
  
--- 39,44 ----
     $self->{mas5_cache} = {};
     if ($pre_cache_mas5 eq '1') {
!      if (-f $mas5_cache_output) { $self->{mas5_cache} = retrieve($mas5_cache_output); }
!      else { $self->{mas5_cache} = $self->_pre_cache_mas5($mas5_dir); }
     }

Flat | Threaded

1 2 > >> (Page 1 of 2)