|
From: <boc...@su...> - 2006-02-22 09:12:28
|
Update of /cvsroot/dev-boconnor/project_logic_analysis/conf In directory sumo.genetics.ucla.edu:/tmp/cvs-serv7915 Modified Files: make_profiles_and_run_la_include.xml original_glioma_classification_with_profiles.xml Added Files: classification_with_profiles.xml classification_with_vgl.xml p53_breast_cancer_data_Miller_et_al_2005.xml t-cell_leukemia_data_Soulier_et_al_2005.xml vgl_parsing_pipe.xml Log Message: Updates and additions to the conf files. I've tried to parameterize these as much as possible --- NEW FILE: t-cell_leukemia_data_Soulier_et_al_2005.xml --- <!-- Variables that are used throughout --> [% datadir = "t-cell_leukemia_data_Soulier_et_al_2005" %] [% cutoff_for_stability_percent = 10 %] [% dirs = ['90'] %] [% dir_str = '90' %] [% percent_to_hold_for_testset = 20 %] [% num_profiles = 2 %] [% total_number_profiles = 100 %] [% times_to_repeat = 100 %] [% profile_block_size = 10 %] [% ppla_block_size = 10 %] [% index = 0 %] <project project_name="Project_Logic_Analysis" project_description="This project looks at understanding gene relationships using the logic analysis technique created by P. Bowers and T. Yeates. I've extended the technique to use microarray data." db_uri="dbi:Pg:host=164.67.97.78;dbname=pipe" db_user="boconnor" db_password=""> <pipe pipe_name="Logic_Analysis_Network_Stability_With_T-Cell_Leukemia_Data_Pipe" pipe_desc="Tests the stability of networks using the t-cell leukemia dataset by Soulier et al." 
pipe_dir="/raid5a/boconnor/cvsroot/dev-boconnor/project_logic_analysis" testing_mode="1"> <settings> <plugin name="Logger" processor="Nelson::Pipe::Container::Plugin::Logger" log_to="db"/> <plugin name="SystemStateRecorder" processor="Nelson::Pipe::Container::Plugin::SystemStateRecorder"/> <plugin name="Versioner" processor="Nelson::Pipe::Container::Plugin::CVSVersioner" version_dir="/raid5a/boconnor/cvsroot/dev-boconnor/project_logic_analysis" tag_identifier="Logic_Analysis_Network_Stability_With_T-Cell_Leukemia_Data_Pipe"/> <plugin name="Publisher" processor="Nelson::Pipe::Container::Plugin::WebPublisher" publish_root_dir="/raid5a/boconnor/public_html/Projects" publish_url_prefix="http://sumo.genetics.ucla.edu/~boconnor/Projects"/> </settings> <initialization> <plugin name="Logger"/> <plugin name="SystemStateRecorder"/> <plugin name="Versioner"/> </initialization> <run> [%# this is an example of a comment %] <!-- Parses SIF, makes profiles, runs LA, and parses the result into a common data structure (which is detailed in my blog entry here: https://boconnor.is-a-geek.com/wiki/index.php?n=BoconnorResearchBlog.20051108 --> [% sif_file = "data/t-cell_leukemia_data_Soulier_et_al_2005/sif/t-cell_sif_for_vgl.txt" %] [% file_map_file = "NA" %] [% phenotypes = "NA" %] [% col_ordering = "NA" %] [% sif_format = "simple" %] [% random_selection_technique = "across_all_samples" %] [% parse_old_mas5 = 0 %] [% profiles_to_count = "TAL_R,HOX_R" %] [% compare_to_reference = "0" %] [% INCLUDE make_profiles_and_run_la_include.xml %] <!-- Code that parses the Voting Gene List output --> [% index = index + 100 %] [% base_col = 81 %] <!-- this is the column that starts a result, it just has a column header and nothing in the rows --> [% input_file_name = "dChipExpr_Leukemia_groupTtest.xls" %] [% pheno_str_1 = "TAL_R" %] [% pheno_str_2 = "HOX_R" %] [%# INCLUDE vgl_parsing_pipe.xml %] <!-- Now take the logic analysis information and extract the top X profiles present in Y% or more of 
the experiments and 1) produce a sorted HTML output for it that can be browsed and 2) build up networks for the same set of profiles and graph them out with graphviz. --> <!-- FIXME: It doesn't always find the profile in the reference set. Need to fix this!!! --> <!-- FIXME: I thought I fixed the not finding profile in ref set problem but still happens in 50% set --> <!-- FIXME: need to find all the profiles otherwise there won't be much to classify with --> [% index = index + 100 %] <!-- FIXME: scoping issues with this variable!! --> <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::PickTopProfiles"> <input> <item id="parsed_output_stashname" value="summary_of_ppla_output"/> <item id="parsed_output_filename" value="data/[% datadir %]/ppla_output/parsed_output.storable"/> <item id="profile_sub_dirs" value="[% dir_str %]"/> <item id="cutoff" value="[% cutoff_for_stability_percent %]"/> <item id="complete_ppla_output" value="data/[% datadir %]/ppla_output/100/file_list_0.output"/> <!-- Not used --> </input> <output> <item id="output_dir" value="data/[% datadir %]/top_profiles"/> </output> </step> <!-- Morgan's program (which I heavily modified) to create an HTML document to display the profiles of interest --> <!-- perl visualiseTriplets.pl ../glioma_data/sorted_profiles_both_annotated.out ../glioma_data/profiles.txt -eprofile_results_complete_annotations.storable > ~/public_html/project_logic_analysis/glioma_profiles/brain_profiles_logic_type_both_annotated.html --> [% index = index + 1 %] [% FOREACH dir = dirs %] <!-- FIXME: does this sort the profiles? I don't do that elsewhere. 
Maybe I should in PickTopProfiles --> <step id="[% index %]" active="0" type="shell_command" processor="perl scripts/visualiseTriplets.pl"> <processor_args> <arg id="1" name="" value="data/[% datadir %]/top_profiles/[% dir %]/top_[% cutoff_for_stability_percent %]_percent.profiles"/> <arg id="2" name="" value="data/[% datadir %]/profiles/100/file_list_0.profile"/> <arg id="3" name="" value="-edata/annotations/profile_results_complete_annotations.storable"/> <arg id="4" name="" value="> data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.html"/> </processor_args> <output> <item id="output_file" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.html" publish="0"/> </output> </step> [% index = index + 1 %] [% END %] <!-- This section calls Peter's code to calculate p-values based on hypergeometric dist. It relies on the output of ReadProfileOutput and PickTopProfiles. It's a hack on the visualizer to create an input to Peter's hypergeometric calculation. --> [% index = index + 1 %] [% FOREACH dir = dirs %] <!-- FIXME: does this sort the profiles? I don't do that elsewhere. 
Maybe I should in PickTopProfiles --> <step id="[% index %]" active="0" type="shell_command" processor="perl scripts/exportTriplets.pl"> <processor_args> <arg id="1" name="" value="data/[% datadir %]/top_profiles/[% dir %]/top_[% cutoff_for_stability_percent %]_percent.profiles"/> <arg id="2" name="" value="data/[% datadir %]/profiles/100/file_list_0.profile"/> <arg id="3" name="" value="-edata/annotations/profile_results_complete_annotations.storable"/> <arg id="4" name="" value="> data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt"/> </processor_args> <output> <item id="output_file" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt" publish="0"/> </output> </step> [% index = index + 1 %] <step id="[% index %]" active="0" type="shell_command" processor="perl scripts/hypergeometric.pl"> <processor_args> <arg id="1" name="" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt"/> <arg id="2" name="" value="> data/[% datadir %]/hypergeometric/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent_hyper_probs.txt"/> </processor_args> </step> <!-- This script just takes the output from exportTriplets and adds some additional information (stability score and p-value) It also reads in the storable object (profile_data) which contains a ton of parsed data and creates a new "p-value" entry that stores the various p-value calculations done by Peter. This is used by FindMostConnectedNodes and visualiseTriplets.pl to annotate the results. This script writes the frequency and p-values back to data/[% datadir %]/ppla_output/parsed_output.storable --> <!-- FIXME: I should look for ways to consolidate the writing of p-values and freq. 
back to parsed_output.storable --> <!-- FIXME: the information contained in the output of this script is really useful and I should (somehow) add it to the visualiseTriplet.pl output.--> [% index = index + 1 %] <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::AppendPValuesToExportOutput"> <input> <item id="profile_output" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt"/> <item id="hypergeometric_output" value="data/[% datadir %]/hypergeometric/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent_hyper_probs.txt"/> <item id="profile_data" value="data/[% datadir %]/ppla_output/parsed_output.storable"/> <item id="subdir" value="[% dir %]"/> </input> <output> <item id="output_file" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent_w_pvalues.txt"/> <item id="output_storable" value="data/[% datadir %]/ppla_output/parsed_output.storable"/> </output> </step> [% index = index + 1 %] [% END %] <!-- now parse out the top profiles and collect some statistics on them --> <!-- FIXME: this is redundant with what's below! --> [% index = index + 1 %] <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAOutputProfiles"> <input> <item id="ppla_output_profiles_dir" value="data/[% datadir %]/top_profiles"/> <item id="subdirs" value="[% dir_str %]"/> <item id="filename" value="top_[% cutoff_for_stability_percent %]_percent.profiles"/> </input> <output> <item id="stash_output" value="stable_ppla_output_profiles"/> <item id="stash_output_file" value="data/[% datadir %]/top_profiles/parsed_ppla_output.storable"/> </output> </step> <!-- This step goes through the PPLA output parsed above and counts how many times a given probeset is included in a triplet relationship. 
It then summarizes this information into a hash and hands off the display of the information to a tt2. The output is an HTML document that lists the most connected genes and links to the output to visualiseTriplets.pl for each subset. This script is responsible for calling visualiseTriplets.pl on the subset of the triplets in question to visualize the individual networks with html and png output. --> <!-- FIXME: this step should be followed with other network-based analysis on the logic relationships --> <!-- FIXME: subdir is currently hardcoded inside this script!! --> <!-- FIXME: remove the calls to other programs/scripts and move this fxn into another module --> [% index = index + 1 %] [% FOREACH dir = dirs %] <!-- FIXME: failing with "undef error - Can't use string ("") as a HASH ref while "strict refs" in use at lib/perl/Nelson/Pipe/Container/Job/FindMostConnectedNodes.pm line 383" --> <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::FindMostConnectedNodes"> <input> <item id="stash_input" value="stable_ppla_output_profiles"/> <item id="ppla_output_profiles_dir" value="data/[% datadir %]/top_profiles"/> <item id="subdir" value="[% dir%]"/> <item id="extra_info" value="data/annotations/profile_results_complete_annotations.storable"/> <item id="filename" value="top_[% cutoff_for_stability_percent %]_percent.profiles"/> <item id="pvalues" value="data/[% datadir %]/ppla_output/parsed_output.storable"/> <item id="AppendPValuesToExportOutput_output" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent_w_pvalues.txt"/> <item id="min_triplets" value="3"/> <item id="template_most_connected" value="index_for_connected_nodes.tt2"/> <item id="template_lowest_p_value" value="index_for_connected_nodes.tt2"/> <item id="template_most_stable" value="index_for_connected_nodes.tt2"/> <item id="template_detailed" value="details_for_connected_nodes.tt2"/> <item id="template_dir" 
value="data/[% datadir %]/analysis/templates"/> <item id="profiles_to_count" value="[% profiles_to_count %]"/> </input> <output> <item id="output_dir" value="data/[% datadir %]/visualization/90/top_[% cutoff_for_stability_percent %]_percent_stable/breakdown_3_or_more"/> </output> </step> [% index = index + 1 %] [% END %] <!-- Just creates a summary page at http://humerus/project_logic_analysis --> [% index = index + 1 %] <step id="[% index %]" active="0" type="shell_command" processor="./scripts/wiki2html.pl"> <processor_args> <arg id="1" name="" value="data/[% datadir %]/visualization/introduction.txt"/> <arg id="2" name="" value="> /raid5a/boconnor/public_html/Projects/Project_Logic_Analysis/Logic_Analysis_Network_Stability_With_Original_Brain_Tumor_Data_Pipe/index.html"/> </processor_args> </step> <!-- look for a bias in the oncogene/tumor suppressor annotations. This is less flexible than the generic annotation bias checker below --> <!-- FIXME: includes some hardcoded elements --> [%# INCLUDE oncogene_counts.xml %] <!-- This series of scripts takes an input list of "interesting" probesets and build a network of what they connect with It annotates those probesets using the Affy array information file and then colors the nodes as green if a "!" probeset and red if it's expressed. FIXME: this assumption only works if the network is built with one phenotype at a time! The nodes then link back to the summary HTML descriptions and the edges link to records within the HTML description files making it easy to see the actual relationship, binary profiles, and additional free text annotations associated with each probeset. The output is directed to a visualization directory. 
--> [%# INCLUDE build_interesting_networks.xml %] <!-- A very simple series of scripts that 1) pull out a non-redundant list of probesets in the most stable logic triplets, 2) counts the number of probesets whose OMIM record contains one or more of a collection of keyword terms, 3) compare this to X number of random trials where the same number of probesets are selected randomly and the annotation bias is checked, and finally 4) treat these numbers of matching probesets for each trial as a normal random variable and compute a two-tailed p-value for the number of annotations on the original list of probesets. --> [%# INCLUDE compare_annotation_bias.xml %] <!-- # identify list of probesets of interest # for each probeset, identify genomic location via chado # extract upstream region of 2Kb # scan 2Kb region for known binding sites # identify factors binding these sites # repeat whole process X times with random lists of probesets and evaluate significance of results (maybe I can work with Barry on a statistical technique that doesn't require random sampling) --> [%# INCLUDE search_for_tf_binding_sites.xml %] <!-- Performs classification based on profiles (triplet relationships). Take Z number of non-training set data and run it through a prediction process where the microarray data is converted to [1|0] and each profile is assessed. If it's valid then the score gets a 1 otherwise -1 and normalize on the number of profiles used for that HC. At the end there should be a score for each HC for a given sample, assign it to the HC with the highest score. --> [% profiles_to_count = "TAL_R,HOX_R" %] <!-- FIXME: this is redefined from above --> [% test_set_annotations_file = "test_set" %] [% index = index + 100 %] [% INCLUDE classification_with_profiles.xml %] <!-- Perform the classification based on the vgl from Marc --> <!-- All the inputs need to be defined here! 
--> [% index = index + 100 %] [% testset_w_annotations = "test_set_90_w_annotations.txt" %] [% exp_values = "data/t-cell_leukemia_data_Soulier_et_al_2005/dChipExpr_Leukemia.xls" %] [%# INCLUDE classification_with_vgl.xml %] <!-- LEFT OFF HERE --> <!-- Collect some statistics on stability and U score --> [%# INCLUDE original_glioma_statistics_on_stability.xml %] </run> <cleanup> <plugin name="Publisher"/> </cleanup> </pipe> </project> --- NEW FILE: classification_with_profiles.xml --- <!-- The next steps will read the PPLA input from CreateProfiles, read the top X number of profiles from PickTopProfiles, and use these profiles and expression data to, for each sample in a list of samples, score each sample +1 if it matches a profile and -1 if it doesn't match a profile. The output is a hash where the key is the sample name and the values are each phenotype (ie HC) and it's corresponding score normalized by the number of profiles used for that score and ranging between 1 and -1. Finally the results will be summarized as correct or not and the overall predictive process will be scored. 
--> <!-- FIXME: all PPLA input files must contain /^sample/ on the first row --> [% index = index + 1 %] <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAInputFile"> <input> <item id="ppla_input_file" value="data/[% datadir %]/profiles/100/file_list_0.profile"/> </input> <output> <item id="stash_output" value="complete_ppla_input"/> </output> </step> [% index = index + 1 %] <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAOutputProfiles"> <input> <item id="ppla_output_profiles_dir" value="data/[% datadir %]/top_profiles"/> <item id="subdirs" value="[% dir_str %]"/> <item id="filename" value="top_[% cutoff_for_stability_percent %]_percent.profiles"/> </input> <output> <item id="stash_output" value="stable_ppla_output_profiles"/> </output> </step> <!-- FIXME: did I code all the logic types correctly? Also, this module contains hardcoded phenotypes. --> [% FOREACH dir = dirs %] [% index = index + 1 %] <!-- Normally, RandomlySelectFiles should produce this output that includes the filename\tannotation. The problem is what annotation to use? In the brain tumor data it was simple: HC_1A...etc. Here I want to use dlda phenotype but I modified RandomlySelectFiles to not output the phenotype (since because of optimization it isn't stored) I wrote a quick script below to append the annotation onto the file. It should only be used when data is read for non-glioma datasets. This uses the data from SifFileParser to find the annotations. --> <!-- FIXME: THIS STEP IS A HACK!!!! 
--> <step id="[% index %]" active="0" type="shell_command" processor="./scripts/fix_testset_t-cell_annotations.pl"> <processor_args> <arg id="1" name="" value="data/[% datadir %]/rand_file_lists/[% test_set_annotations_file %]_[% dir %].txt"/> <arg id="2" name="" value="data/[% datadir %]/sif_hash.storable"/> <arg id="3" name="" value="> data/[% datadir %]/rand_file_lists/[% test_set_annotations_file %]_[% dir %]_w_annotations.txt"/> </processor_args> </step> [% index = index + 1 %] <!-- FIXME: this needs to loop and only use the top 10,20,30,40...100 top profiles for each HC --> <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::ScoreSamplesViaProfiles"> <input> <!-- FIXME: these phenotypes are hard coded, may need to change with different datasets --> <!-- FIXME: I could run this module a second time using an expanded test set with the 111 glioma samples, filtering out anything used in training. --> <item id="list_of_files" value="data/[% datadir %]/rand_file_lists/[% test_set_annotations_file %]_[% dir %]_w_annotations.txt"/> <item id="list_of_phenotypes" value="[% profiles_to_count %]"/> <item id="ppla_input_stash" value="complete_ppla_input"/> <item id="profiles_stash" value="stable_ppla_output_profiles"/> <item id="subdir" value="[% dir %]"/> <item id="total_profiles_to_use" value="[% num %]"/> <item id="profile_count_cutoffs" value="1,2,3,4,5,10,15,20,30,40,50,60,70,80,90,100,all"/> </input> <output> <item id="stash_output" value="scores_for_samples"/> <item id="output_summary_file" value="data/[% datadir %]/analysis/classifications/[% dir %]_profile_based_classification_summary"/> </output> </step> [% index = index + 1 %] <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::SummarizeClassification"> <input> <item id="stash_input" value="scores_for_samples"/> <item id="subdir" value="[% dir %]"/> <!-- FIXME: this template includes a hard-coded dimension!! 
--> <item id="sample_number" value="16"/> <item id="R_template" value="data/[% datadir %]/analysis/templates/profile_count_vs_correct_percentage.R.tt2"/> </input> <output> <item id="R_file" value="data/[% datadir %]/analysis/results/[% dir %]_profile_count_vs_correct_percentage_profiles.R"/> <item id="png_file_dir" value="data/[% datadir %]/analysis/results"/> </output> </step> [% index = index + 1 %] <step id="[% index %]" active="0" type="shell_command" processor="R"> <processor_args> <arg id="1" name="" value="--vanilla"/> <arg id="2" name="" value="< data/[% datadir %]/analysis/results/[% dir %]_profile_count_vs_correct_percentage_profiles.R"/> </processor_args> </step> [% index = index + 1 %] [% END %] <!-- THE SAME BUT RANDOMIZED! --> <!-- The next steps will read the PPLA input from CreateProfiles, read the top X number of profiles from PickTopProfiles, and use these profiles and expression data to, for each sample in a list of samples, score each sample +1 if it matches a profile and -1 if it doesn't match a profile. The output is a hash where the key is the sample name and the values are each phenotype (ie HC) and it's corresponding score normalized by the number of profiles used for that score and ranging between 1 and -1. Finally the results will be summarized as correct or not and the overall predictive process will be scored. --> <!-- FIXME: all PPLA input files must contain /^sample/ on the first row --> [% index = index + 1 %] <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAInputFile"> <input> <item id="ppla_input_file" value="data/[% datadir %]/profiles/100/file_list_0.profile"/> <item id="randomized" value="1"/> </input> <output> <item id="stash_output" value="complete_ppla_input_randomized"/> </output> </step> [% index = index + 1 %] <!-- FIXME: did I code all the logic types correctly? Also, this module contains hardcoded phenotypes. 
--> [% FOREACH dir = dirs %] [% index = index + 1 %] <!-- Normally, RandomlySelectFiles should produce this output that includes the filename\tannotation. The problem is what annotation to use? In the brain tumor data it was simple: HC_1A...etc. Here I want to use dlda phenotype but I modified RandomlySelectFiles to not output the phenotype (since because of optimization it isn't stored) I wrote a quick script below to append the annotation onto the file. It should only be used when data is read for non-glioma datasets. This uses the data from SifFileParser to find the annotations. --> <!-- FIXME: this needs to loop and only use the top 10,20,30,40...100 top profiles for each HC --> <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::ScoreSamplesViaProfiles"> <input> <!-- FIXME: these phenotypes are hard coded, may need to change with different datasets --> <!-- FIXME: I could run this module a second time using an expanded test set with the 111 glioma samples, filtering out anything used in training. 
--> <item id="list_of_files" value="data/[% datadir %]/rand_file_lists/[% test_set_annotations_file %]_[% dir %]_w_annotations.txt"/> <item id="list_of_phenotypes" value="[% profiles_to_count %]"/> <item id="ppla_input_stash" value="complete_ppla_input_randomized"/> <item id="profiles_stash" value="stable_ppla_output_profiles"/> <item id="subdir" value="[% dir %]"/> <item id="total_profiles_to_use" value="[% num %]"/> <item id="profile_count_cutoffs" value="1,2,3,4,5,10,15,20,30,40,50,60,70,80,90,100,all"/> </input> <output> <item id="stash_output" value="scores_for_samples_randomized"/> <item id="output_summary_file" value="data/[% datadir %]/analysis/classifications/[% dir %]_profile_based_classification_randomized_summary"/> </output> </step> [% index = index + 1 %] <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::SummarizeClassification"> <input> <item id="stash_input" value="scores_for_samples_randomized"/> <item id="subdir" value="[% dir %]"/> <!-- FIXME: this template includes a hard-coded dimension!! 
--> <item id="sample_number" value="16"/> <item id="R_template" value="data/[% datadir %]/analysis/templates/profile_count_vs_correct_percentage.R.tt2"/> </input> <output> <item id="R_file" value="data/[% datadir %]/analysis/results/[% dir %]_profile_count_vs_correct_percentage_profiles_randomized.R"/> <item id="png_file_dir" value="data/[% datadir %]/analysis/results"/> </output> </step> [% index = index + 1 %] <step id="[% index %]" active="1" type="shell_command" processor="R"> <processor_args> <arg id="1" name="" value="--vanilla"/> <arg id="2" name="" value="< data/[% datadir %]/analysis/results/[% dir %]_profile_count_vs_correct_percentage_profiles_randomized.R"/> </processor_args> </step> [% index = index + 1 %] [% END %] Index: make_profiles_and_run_la_include.xml =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/conf/make_profiles_and_run_la_include.xml,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** make_profiles_and_run_la_include.xml 17 Feb 2006 00:09:07 -0000 1.5 --- make_profiles_and_run_la_include.xml 22 Feb 2006 09:12:23 -0000 1.6 *************** *** 2,6 **** [% index = index + 1 %] <!-- parses the SIF file to generate a hash of file names and their HC --> ! <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::SifFileParser"> <input> <item id="sif_file" value="[% sif_file %]"/> --- 2,6 ---- [% index = index + 1 %] <!-- parses the SIF file to generate a hash of file names and their HC --> ! 
<step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::SifFileParser"> <input> <item id="sif_file" value="[% sif_file %]"/> *************** *** 51,54 **** --- 51,55 ---- <item id="end" value="[% j+profile_block_size %]"/> <item id="pre_cache_mas5" value="1"/> + <item id="no_overwrite" value="1"/> </input> <output> *************** *** 58,63 **** </step> [% index = index + 1 %] - [% j = j+profile_block_size %] [% END %] [% END %] --- 59,64 ---- </step> [% index = index + 1 %] [% END %] + [% j = j+profile_block_size %] [% END %] *************** *** 65,68 **** --- 66,70 ---- [% index = index + 1 %] <!-- the next two steps just read all the profiles --> + <!-- FIXME: LEFT OFF HERE, this dataset should include none of the testing samples!! --> <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::RandomlySelectFiles"> <input> *************** *** 102,110 **** [% WHILE j < total_number_profiles %] <!-- execution_type="cluster" --> ! <step id="[% index %].[% j %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::PPLARunner" execution_type="cluster"> <input> <item id="entropy_filter" value="3"/> ! <item id="individual_u_max" value="0.4"/> ! <item id="together_u_min" value="0.6"/> <item id="number_profiles" value="[% num_profiles %]"/> <item id="lowA" value="-1"/> --- 104,112 ---- [% WHILE j < total_number_profiles %] <!-- execution_type="cluster" --> ! <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::PPLARunner"> <input> <item id="entropy_filter" value="3"/> ! <item id="individual_u_max" value="0.3"/> ! 
<item id="together_u_min" value="0.5"/> <item id="number_profiles" value="[% num_profiles %]"/> <item id="lowA" value="-1"/> *************** *** 115,118 **** --- 117,121 ---- <item id="start" value="[% j %]"/> <item id="end" value="[% j+profile_block_size %]"/> + <item id="no_overwrite" value="1"/> <!-- item id="profiles_sub_dirs" value="75,90"/ --> </input> *************** *** 122,128 **** </step> [% j = j+ppla_block_size %] - [% END %] [% index = index + 1 %] [% END %] [% index = index + 1 %] --- 125,153 ---- </step> [% j = j+ppla_block_size %] [% index = index + 1 %] [% END %] + [% END %] + + [% index = index + 1 %] + <!-- This runs too slow! --> + <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::PPLARunner"> + <input> + <item id="entropy_filter" value="3"/> + <item id="individual_u_max" value="0.3"/> + <item id="together_u_min" value="0.5"/> + <item id="number_profiles" value="[% num_profiles %]"/> + <item id="lowA" value="-1"/> + <item id="highA" value="-1"/> + <item id="ppla_bin" value="bin/PPLA-1.1-255"/> + <item id="profiles_dir" value="data/[% datadir %]/profiles"/> + <item id="profiles_sub_dirs" value="100"/> + <item id="start" value="0"/> + <item id="end" value="2"/> + </input> + <output> + <item id="output_dir" value="data/[% datadir %]/ppla_output"/> + </output> + </step> + [% index = index + 1 %] *************** *** 135,144 **** https://boconnor.is-a-geek.com/wiki/index.php?n=BoconnorResearchBlog.20051108 --> ! <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::ReadProfileOutput"> <input> <item id="profile_output_dir" value="data/[% datadir %]/ppla_output"/> <item id="profile_output_sub_dirs" value="[% dir_str %]"/> <item id="reference_profile" value="data/[% datadir %]/ppla_output/100/file_list_0.output"/> ! 
<item id="profiles_to_count" value="HC_1A,HC_1B,HC_2A,HC_2B,grade_3,grade_4,sex_f,sex_m,survial_time_group_36,survial_time_group_37,survial_time_group_38,survial_time_group_43,survial_time_group_53,survial_time_group_54,survial_time_group_57,survial_time_group_31,survial_time_group_32,tumor_type_astro,tumor_type_gbm,tumor_type_mixed,tumor_type_oligo,survival_cluster_1,survival_cluster_2"/> </input> <output> --- 160,170 ---- https://boconnor.is-a-geek.com/wiki/index.php?n=BoconnorResearchBlog.20051108 --> ! <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::ReadProfileOutput"> <input> <item id="profile_output_dir" value="data/[% datadir %]/ppla_output"/> <item id="profile_output_sub_dirs" value="[% dir_str %]"/> <item id="reference_profile" value="data/[% datadir %]/ppla_output/100/file_list_0.output"/> ! <item id="profiles_to_count" value="[% profiles_to_count %]"/> ! <item id="compare_to_reference" value="[% compare_to_100_percent_reference %]"/> </input> <output> Index: original_glioma_classification_with_profiles.xml =================================================================== RCS file: /cvsroot/dev-boconnor/project_logic_analysis/conf/original_glioma_classification_with_profiles.xml,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** original_glioma_classification_with_profiles.xml 18 Jan 2006 01:15:26 -0000 1.2 --- original_glioma_classification_with_profiles.xml 22 Feb 2006 09:12:23 -0000 1.3 *************** *** 8,12 **** --> <!-- FIXME: all PPLA input files must contain /^sample/ on the first row --> ! <step id="43.1" active="0" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAInputFile"> <input> <item id="ppla_input_file" value="data/[% datadir %]/profiles/100/file_list_0.profile"/> --- 8,13 ---- --> <!-- FIXME: all PPLA input files must contain /^sample/ on the first row --> ! [% index = index + 1 %] ! 
<step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAInputFile"> <input> <item id="ppla_input_file" value="data/[% datadir %]/profiles/100/file_list_0.profile"/> *************** *** 16,20 **** </output> </step> ! <step id="43.2" active="0" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAOutputProfiles"> <input> <item id="ppla_output_profiles_dir" value="data/[% datadir %]/top_profiles"/> --- 17,22 ---- </output> </step> ! [% index = index + 1 %] ! <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAOutputProfiles"> <input> <item id="ppla_output_profiles_dir" value="data/[% datadir %]/top_profiles"/> *************** *** 27,44 **** </step> <!-- FIXME: did I code all the logic types correctly? Also, this module contains hardcoded phenotypes. --> - [% i = 1 %] [% FOREACH dir = dirs %] <!-- FIXME: this needs to loop and only use the top 10,20,30,40...100 top profiles for each HC --> ! <step id="44.1[% i %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::ScoreSamplesViaProfiles"> <input> <!-- FIXME: these phenotypes are hard coded, may need to change with different datasets --> <!-- FIXME: I could run this module a second time using an expanded test set with the 111 glioma samples, filtering out anything used in training. --> ! <item id="list_of_files" value="data/[% datadir %]/rand_file_lists/test_set.txt"/> ! <item id="list_of_phenotypes" value="HC_1A,HC_2A,HC_1B,HC_2B"/> <item id="ppla_input_stash" value="complete_ppla_input"/> <item id="profiles_stash" value="stable_ppla_output_profiles"/> <item id="subdir" value="[% dir %]"/> <item id="total_profiles_to_use" value="[% num %]"/> ! <item id="profile_count_cutoffs" value="5,10,15,20,30,40,50,60,70,80,90,100,all"/> </input> <output> --- 29,62 ---- </step> <!-- FIXME: did I code all the logic types correctly? Also, this module contains hardcoded phenotypes. 
--> [% FOREACH dir = dirs %] + [% index = index + 1 %] + <!-- Normally, RandomlySelectFiles should produce this output that includes the filename\tannotation. The problem is + what annotation to use? In the brain tumor data it was simple: HC_1A...etc. Here I want to use dlda phenotype + but I modified RandomlySelectFiles to not output the phenotype (since because of optimization it isn't stored) + I wrote a quick script below to append the annotation onto the file. It should only be used when data is read + for non-glioma datasets. + This uses the data from SifFileParser to find the annotations. + --> + <!-- FIXME: THIS STEP IS A HACK!!!! --> + <step id="[% index %]" active="1" type="shell_command" processor="./scripts/fix_testset_t-cell_annotations.pl"> + <processor_args> + <arg id="1" name="" value="data/[% datadir %]/rand_file_lists/[% test_set_annotations_file %]_[% dir %].txt"/> + <arg id="2" name="" value="data/[% datadir %]/sif_hash.storable"/> + <arg id="3" name="" value="> data/[% datadir %]/rand_file_lists/[% test_set_annotations_file %]_[% dir %]_w_annotations.txt"/> + </processor_args> + </step> + [% index = index + 1 %] <!-- FIXME: this needs to loop and only use the top 10,20,30,40...100 top profiles for each HC --> ! <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::ScoreSamplesViaProfiles"> <input> <!-- FIXME: these phenotypes are hard coded, may need to change with different datasets --> <!-- FIXME: I could run this module a second time using an expanded test set with the 111 glioma samples, filtering out anything used in training. --> ! <item id="list_of_files" value="data/[% datadir %]/rand_file_lists/[% test_set_annotations_file %]_[% dir %]_w_annotations.txt"/> ! 
<item id="list_of_phenotypes" value="[% profiles_to_count %]"/> <item id="ppla_input_stash" value="complete_ppla_input"/> <item id="profiles_stash" value="stable_ppla_output_profiles"/> <item id="subdir" value="[% dir %]"/> <item id="total_profiles_to_use" value="[% num %]"/> ! <item id="profile_count_cutoffs" value="1,2,3,4,5,10,15,20,30,40,50,60,70,80,90,100,all"/> </input> <output> *************** *** 47,70 **** </output> </step> ! <step id="44.2[% i %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::SummarizeClassification"> <input> <item id="stash_input" value="scores_for_samples"/> <item id="subdir" value="[% dir %]"/> <!-- FIXME: this template includes a hard-coded dimension!! --> ! <item id="sample_number" value="12"/> <item id="R_template" value="data/[% datadir %]/analysis/templates/profile_count_vs_correct_percentage.R.tt2"/> </input> <output> ! <item id="R_file" value="data/[% datadir %]/analysis/results/[% dir %]_profile_count_vs_correct_percentage.R"/> <item id="png_file_dir" value="data/[% datadir %]/analysis/results"/> </output> </step> ! <step id="44.3[% i %]" active="0" type="shell_command" processor="R"> <processor_args> <arg id="1" name="" value="--vanilla"/> ! <arg id="2" name="" value="< data/[% datadir %]/analysis/results/[% dir %]_profile_count_vs_correct_percentage.R"/> </processor_args> </step> ! [% i = i+1 %] [% END %] --- 65,90 ---- </output> </step> ! [% index = index + 1 %] ! <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::SummarizeClassification"> <input> <item id="stash_input" value="scores_for_samples"/> <item id="subdir" value="[% dir %]"/> <!-- FIXME: this template includes a hard-coded dimension!! --> ! <item id="sample_number" value="16"/> <item id="R_template" value="data/[% datadir %]/analysis/templates/profile_count_vs_correct_percentage.R.tt2"/> </input> <output> ! 
<item id="R_file" value="data/[% datadir %]/analysis/results/[% dir %]_profile_count_vs_correct_percentage_profiles.R"/> <item id="png_file_dir" value="data/[% datadir %]/analysis/results"/> </output> </step> ! [% index = index + 1 %] ! <step id="[% index %]" active="1" type="shell_command" processor="R"> <processor_args> <arg id="1" name="" value="--vanilla"/> ! <arg id="2" name="" value="< data/[% datadir %]/analysis/results/[% dir %]_profile_count_vs_correct_percentage_profiles.R"/> </processor_args> </step> ! [% index = index + 1 %] [% END %] --- NEW FILE: classification_with_vgl.xml --- <!-- Now repeat the whole process, this time use the VGL to classify samples --> <!-- This step uses the output from ReadVGLOutput --> <!-- FIXME: note hardcoded subdir here --> <!-- FIXME: the next three steps don't seem to work. Somewhere the categories seems to be crossed!? --> [% index = index + 1 %] [% FOREACH dir = dirs %] <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::ScoreSamplesViaVGL"> <input> <item id="list_of_files" value="data/[% datadir %]/rand_file_lists/[% testset_w_annotations %]"/> <item id="list_of_phenotypes" value="[% profiles_to_count %]"/> <item id="vgl_input" value="data/[% datadir %]/vgl_output/parsed_output.storable"/> <item id="subdir" value="[% dir %]"/> <!-- item id="profile_count_cutoffs" value="5"/ --> <item id="profile_count_cutoffs" value="1,2,3,4,5,10,15,20,30,40,50,60,70,80,90,100,all"/> <item id="exp_values" value="[% exp_values %]"/> </input> <output> <item id="stash_output" value="vgl_scores_for_samples"/> <item id="output_summary_file" value="data/[% datadir %]/analysis/classifications/[% dir %]_percent_vgl_based_classification_summary"/> </output> </step> [% index = index + 1 %] <step id="[% index %]" active="1" type="module" processor="Nelson::Pipe::Container::Job::SummarizeClassification"> <input> <item id="stash_input" value="vgl_scores_for_samples"/> <item id="subdir" value="[% dir %]"/> <item 
id="sample_number" value="16"/> <!-- this is the number of profile_count_cutoffs (w/o 'all') --> <!-- FIXME: this template includes a hard-coded dimension!! --> <item id="R_template" value="data/[% datadir %]/analysis/templates/profile_count_vs_correct_percentage.R.tt2"/> </input> <output> <item id="R_file" value="data/[% datadir %]/analysis/results/[% dir %]_profile_count_vs_correct_percentage_vgl.R"/> <item id="png_file_dir" value="data/[% datadir %]/analysis/results"/> </output> </step> [% index = index + 1 %] <step id="[% index %]" active="1" type="shell_command" processor="R"> <processor_args> <arg id="1" name="" value="--vanilla"/> <arg id="2" name="" value="< data/[% datadir %]/analysis/results/[% dir %]_profile_count_vs_correct_percentage_vgl.R"/> </processor_args> </step> [% index = index + 1 %] [% END %] --- NEW FILE: p53_breast_cancer_data_Miller_et_al_2005.xml --- <!-- Variables that are used throughout --> [% datadir = "p53_breast_cancer_data_Miller_et_al_2005" %] [% cutoff_for_stability_percent = 10 %] [% dirs = ['75'] %] [% dir_str = '75' %] [% percent_to_hold_for_testset = 35 %] [% num_profiles = 14 %] [% total_number_profiles = 100 %] [% times_to_repeat = 100 %] [% profile_block_size = 10 %] [% ppla_block_size = 10 %] [% index = 0 %] <project project_name="Project_Logic_Analysis" project_description="This project looks at understanding gene relationships using the logic analysis technique created by P. Bowers and T. Yeates. I've extended the technique to use microarray data." db_uri="dbi:Pg:host=164.67.97.78;dbname=pipe" db_user="boconnor" db_password=""> <pipe pipe_name="Logic_Analysis_Network_Stability_With_p53_Breast_Cancer_Data_Miller_et_al_Pipe" pipe_desc="Tests the stability of networks using the p53 breast cancer dataset by Miller et al." 
pipe_dir="/raid5a/boconnor/cvsroot/dev-boconnor/project_logic_analysis" testing_mode="1"> <settings> <plugin name="Logger" processor="Nelson::Pipe::Container::Plugin::Logger" log_to="db"/> <plugin name="SystemStateRecorder" processor="Nelson::Pipe::Container::Plugin::SystemStateRecorder"/> <plugin name="Versioner" processor="Nelson::Pipe::Container::Plugin::CVSVersioner" version_dir="/raid5a/boconnor/cvsroot/dev-boconnor/project_logic_analysis" tag_identifier="Logic_Analysis_Network_Stability_With_T-Cell_Leukemia_Data_Pipe"/> <plugin name="Publisher" processor="Nelson::Pipe::Container::Plugin::WebPublisher" publish_root_dir="/raid5a/boconnor/public_html/Projects" publish_url_prefix="http://sumo.genetics.ucla.edu/~boconnor/Projects"/> </settings> <initialization> <plugin name="Logger"/> <plugin name="SystemStateRecorder"/> <plugin name="Versioner"/> </initialization> <run> [%# this is an example of a comment %] <!-- Parses SIF, makes profiles, runs LA, and parses the result into a common data structure (which is detailed in my blog entry here: https://boconnor.is-a-geek.com/wiki/index.php?n=BoconnorResearchBlog.20051108 --> [% sif_file = "data/p53_breast_cancer_data_Miller_et_al_2005/sif/p53_sif.txt" %] [% file_map_file = "data/p53_breast_cancer_data_Miller_et_al_2005/sif/p53_map.txt" %] [% phenotypes = "NA" %] [% col_ordering = "NA" %] [% sif_format = "geo" %] [% random_selection_technique = "across_all_samples" %] [% parse_old_mas5 = 0 %] [% profiles_to_count = "grade_1,grade_2,grade_3,lymph_pos,lymph_neg,er_pos,er_neg,pgr_pos,pgr_neg,dlda_wt,dlda_mt,p53_wt,p53_mt" %] [% compare_to_reference = "0" %] [% INCLUDE make_profiles_and_run_la_include.xml %] <!-- Code that parses the Voting Gene List output --> <!-- LEFT OFF HERE --> <!-- The ReadVGLOutput module needs to be reworked to remove reference to 100% dataset and also to parse output correctly --> [% index = index + 100 %] [% base_col = 104 %] [% input_file_name = "dChipExpr_BreastCancer_groupTtest.xls" %] [% 
pheno_str_1 = "DLDA_WT" %] [% pheno_str_2 = "DLDA_MT" %] [%# INCLUDE vgl_parsing_pipe.xml %] <!-- Now take the logic analysis information and extract the top X profiles present in Y% or more of the experiments and 1) produce a sorted HTML output for it that can be browsed and 2) build up networks for the same set of profiles and graph them out with graphviz. --> <!-- FIXME: It doesn't always find the profile in the reference set. Need to fix this!!! --> <!-- FIXME: I thought I fixed the not finding profile in ref set problem but still happens in 50% set --> <!-- FIXME: need to find all the profiles otherwise there won't be much to classify with --> [% index = index + 29 %] <!-- FIXME: scoping issues with this variable!! --> <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::PickTopProfiles"> <input> <item id="parsed_output_stashname" value="summary_of_ppla_output"/> <item id="parsed_output_filename" value="data/[% datadir %]/ppla_output/parsed_output.storable"/> <item id="profile_sub_dirs" value="[% dir_str %]"/> <item id="cutoff" value="[% cutoff_for_stability_percent %]"/> <item id="complete_ppla_output" value="data/[% datadir %]/ppla_output/100/file_list_0.output"/> <!-- Not used --> </input> <output> <item id="output_dir" value="data/[% datadir %]/top_profiles"/> </output> </step> <!-- Morgan's program (which I heavily modified) to create an HTML document to display the profiles of interest --> <!-- perl visualiseTriplets.pl ../glioma_data/sorted_profiles_both_annotated.out ../glioma_data/profiles.txt -eprofile_results_complete_annotations.storable > ~/public_html/project_logic_analysis/glioma_profiles/brain_profiles_logic_type_both_annotated.html --> [% index = index + 1 %] [% FOREACH dir = dirs %] <!-- FIXME: does this sort the profiles? I don't do that elsewhere. 
Maybe I should in PickTopProfiles --> <step id="[% index %]" active="0" type="shell_command" processor="perl scripts/visualiseTriplets.pl"> <processor_args> <arg id="1" name="" value="data/[% datadir %]/top_profiles/[% dir %]/top_[% cutoff_for_stability_percent %]_percent.profiles"/> <arg id="2" name="" value="data/[% datadir %]/profiles/100/file_list_0.profile"/> <arg id="3" name="" value="-edata/annotations/profile_results_complete_annotations.storable"/> <arg id="4" name="" value="> data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.html"/> </processor_args> <output> <item id="output_file" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.html" publish="0"/> </output> </step> [% index = index + 1 %] [% END %] <!-- This section calls Peter's code to calculate p-values based on hypergeometric dist. It relies on the output of ReadProfileOutput and PickTopProfiles. It's a hack on the visualizer to create an input to Peter's hypergeometric calculation. --> [% index = index + 1 %] [% FOREACH dir = dirs %] <!-- FIXME: does this sort the profiles? I don't do that elsewhere. 
Maybe I should in PickTopProfiles --> <step id="[% index %]" active="0" type="shell_command" processor="perl scripts/exportTriplets.pl"> <processor_args> <arg id="1" name="" value="data/[% datadir %]/top_profiles/[% dir %]/top_[% cutoff_for_stability_percent %]_percent.profiles"/> <arg id="2" name="" value="data/[% datadir %]/profiles/100/file_list_0.profile"/> <arg id="3" name="" value="-edata/annotations/profile_results_complete_annotations.storable"/> <arg id="4" name="" value="> data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt"/> </processor_args> <output> <item id="output_file" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt" publish="0"/> </output> </step> [% index = index + 1 %] <step id="[% index %]" active="0" type="shell_command" processor="perl scripts/hypergeometric.pl"> <processor_args> <arg id="1" name="" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt"/> <arg id="2" name="" value="> data/[% datadir %]/hypergeometric/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent_hyper_probs.txt"/> </processor_args> </step> <!-- This script just takes the output from exportTriplets and adds some additional information (stability score and p-value) It also reads in the storable object (profile_data) which contains a ton of parsed data and creates a new "p-value" entry that stores the various p-value calculations done by Peter. This is used by FindMostConnectedNodes and visualiseTriplets.pl to annotate the results. This script writes the frequency and p-values back to data/[% datadir %]/ppla_output/parsed_output.storable --> <!-- FIXME: I should look for ways to consolidate the writing of p-values and freq. 
back to parsed_output.storable --> <!-- FIXME: the information contained in the output of this script is really useful and I should (somehow) add it to the visualiseTriplet.pl output.--> [% index = index + 1 %] <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::AppendPValuesToExportOutput"> <input> <item id="profile_output" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent.txt"/> <item id="hypergeometric_output" value="data/[% datadir %]/hypergeometric/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent_hyper_probs.txt"/> <item id="profile_data" value="data/[% datadir %]/ppla_output/parsed_output.storable"/> <item id="subdir" value="[% dir %]"/> </input> <output> <item id="output_file" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent_w_pvalues.txt"/> <item id="output_storable" value="data/[% datadir %]/ppla_output/parsed_output.storable"/> </output> </step> [% index = index + 1 %] [% END %] <!-- now parse out the top profiles and collect some statistics on them --> <!-- FIXME: this is redundant with what's below! --> [% index = index + 1 %] <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::ReadPPLAOutputProfiles"> <input> <item id="ppla_output_profiles_dir" value="data/[% datadir %]/top_profiles"/> <item id="subdirs" value="[% dir_str %]"/> <item id="filename" value="top_[% cutoff_for_stability_percent %]_percent.profiles"/> </input> <output> <item id="stash_output" value="stable_ppla_output_profiles"/> <item id="stash_output_file" value="data/[% datadir %]/top_profiles/parsed_ppla_output.storable"/> </output> </step> <!-- This step goes through the PPLA output parsed above and counts how many times a given probeset is included in a triplet relationship. 
It then summarizes this information into a hash and hands off the display of the information to a tt2. The output is an HTML document that lists the most connected genes and links to the output to visualiseTriplets.pl for each subset. This script is responsible for calling visualiseTriplets.pl on the subset of the triplets in question to visualize the individual networks with html and png output. --> <!-- FIXME: this step should be followed with other network-based analysis on the logic relationships --> <!-- FIXME: subdir is currently hardcoded inside this script!! --> <!-- FIXME: remove the calls to other programs/scripts and move this fxn into another module --> [% index = index + 1 %] [% FOREACH dir = dirs %] <!-- FIXME: failing with "undef error - Can't use string ("") as a HASH ref while "strict refs" in use at lib/perl/Nelson/Pipe/Container/Job/FindMostConnectedNodes.pm line 383" --> <step id="[% index %]" active="0" type="module" processor="Nelson::Pipe::Container::Job::FindMostConnectedNodes"> <input> <item id="stash_input" value="stable_ppla_output_profiles"/> <item id="ppla_output_profiles_dir" value="data/[% datadir %]/top_profiles"/> <item id="subdir" value="[% dir%]"/> <item id="extra_info" value="data/annotations/profile_results_complete_annotations.storable"/> <item id="filename" value="top_[% cutoff_for_stability_percent %]_percent.profiles"/> <item id="pvalues" value="data/[% datadir %]/ppla_output/parsed_output.storable"/> <item id="AppendPValuesToExportOutput_output" value="data/[% datadir %]/top_profiles/[% dir %]/[% dir %]_percent_top_[% cutoff_for_stability_percent %]_percent_w_pvalues.txt"/> <item id="min_triplets" value="3"/> <item id="template_most_connected" value="index_for_connected_nodes.tt2"/> <item id="template_lowest_p_value" value="index_for_connected_nodes.tt2"/> <item id="template_most_stable" value="index_for_connected_nodes.tt2"/> <item id="template_detailed" value="details_for_connected_nodes.tt2"/> <item id="template_dir" 
value="data/[% datadir %]/analysis/templates"/> <item id="profiles_to_count" value="[% profiles_to_count %]"/> </input> <output> <item id="output_dir" value="data/[% datadir %]/visualization/90/top_[% cutoff_for_stability_percent %]_percent_stable/breakdown_3_or_more"/> </output> </step> [% index = index + 1 %] [% END %] <!-- Just creates a summary page at http://humerus/project_logic_analysis --> [% index = index + 1 %] <step id="[% index %]" active="0" type="shell_command" processor="./scripts/wiki2html.pl"> <processor_args> <arg id="1" name="" value="data/[% datadir %]/visualization/introduction.txt"/> <arg id="2" name="" value="> /raid5a/boconnor/public_html/Projects/Project_Logic_Analysis/Logic_Analysis_Network_Stability_With_Original_Brain_Tumor_Data_Pipe/index.html"/> </processor_args> </step> <!-- look for a bias in the oncogene/tumor suppressor annotations. This is less flexible than the generic annotation bias checker below --> <!-- FIXME: includes some hardcoded elements --> [%# INCLUDE oncogene_counts.xml %] <!-- This series of scripts takes an input list of "interesting" probesets and build a network of what they connect with It annotates those probesets using the Affy array information file and then colors the nodes as green if a "!" probeset and red if it's expressed. FIXME: this assumption only works if the network is built with one phenotype at a time! The nodes then link back to the summary HTML descriptions and the edges link to records within the HTML description files making it easy to see the actual relationship, binary profiles, and additional free text annotations associated with each probeset. The outp... [truncated message content] |