[Gmod-schema-cmts] schema/GMODTools/conf/bulkfiles blastfiles.xml,NONE,1.1 chadofeatconv.xml,NONE,1.

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 454-5900

Update of /cvsroot/gmod/schema/GMODTools/conf/bulkfiles
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv13108/conf/bulkfiles

Added Files:
	blastfiles.xml chadofeatconv.xml chadofeatsql.xml 
	dmelhetfeatconv.xml fbbulk-r3h.xml fbbulk-r4.xml 
	gbrowseconf.xml sgdbulk1.xml sgdfeatconf.xml tognomap.xml 
Log Message:
more details worked out; good for dmel, tested w/ sgd chado dbs

--- NEW FILE: dmelhetfeatconv.xml ---
<opt
  name="dmelhetfeatconv"
  date="20040821"
  >

  <title>Chado DB Feature info</title>
  <about>
    Use this one for D.melanogaster heterochromatin genome db.

    These are configurations for converting chado feature table dumps to
    standard feature/sequence files. Much of below specifies how to
    process different features (tied to methods in ChadoFeatDump.pm).

    These configs should be data-set independent.
    This works with, but is independent of SeqUtil2 configs.
  </about>

  <informat>feature_table</informat> <!-- feature_table ? -->

  <outformats>fff</outformats>
  <outformats>gff</outformats>
  <outformats>fasta</outformats>

  <!-- copied from db-release config files .. this is mostly common info,
    but db-release config can override @featset, %featmap -->

  <!-- feature sets to make fasta bulk files -->
  <featset>gene</featset>
  <featset>mRNA</featset>
  <featset>CDS</featset>
  <featset>transcript</featset>
  <featset>translation</featset>
  <featset>tRNA</featset>
  <featset>miscRNA</featset>
  <featset>transposon</featset>
  <featset>pseudogene</featset>
  <featset>gene_extended2000</featset>
  <featset>five_prime_UTR</featset>
  <featset>three_prime_UTR</featset>
  <featset>intron</featset>
  <featset>intergenic</featset>
  <featset>scaffold</featset>

  <!-- 
   featmap for feature sets that need reprocessing, attributes: 
    name = feature set name
    types = feature types, space delimited (in fff/gff)
    typelabel = type to use in output header, paired with types list
    subrange = expansion/extraction range to add to feature location
             (readseq syntax, i.e. {start,end}+/-offset, is not yet supported)
    fromdb = extract from chado database feature.residues field
             rather than chromosome dna file
             (for curated residues - transcript, translation)
    get_id = save id, dbxref fields for add to other (genemodel) features
    add_id = add id of parent feature (genemodel)
    note: translation type=protein is coded into various seq fetch programs
  -->

  <featmap name="gene" get_id="1" />
  <featmap name="five_prime_UTR" add_id="gene" />
  <featmap name="three_prime_UTR" add_id="gene" />
  <featmap name="intron" add_id="gene" />
  <featmap name="mRNA" add_id="gene" />
  <featmap name="CDS" add_id="gene" />

  <featmap
    name="translation"
    types="CDS"
    typelabel="protein"
    fromdb="1"
    />
  <featmap
    name="transcript"
    types="mRNA"
    typelabel="transcript"
    fromdb="1"
    />
  <featmap
    name="scaffold"
    types="golden_path_region"
    typelabel="scaffold"
    fromdb="1"
    />
  <featmap
    name="transposon"
    types="transposable_element"
    typelabel="transposable_element"
    />
  <featmap
    name="miscRNA"
    types="ncRNA snRNA snoRNA rRNA" 
    />
  <featmap
    name="gene_extended2000"
    types="gene"
    typelabel="gene_ex2000"
    subrange="-2000..2000"
    />
  <featmap
    name="intergenic"
    types="gene"
    typelabel="intergenic"
    method="between"
    />

  <rename_child_type>pseudogene|\w+RNA</rename_child_type>
  <mergematch></mergematch>

  <!-- remapType: append this name pattern to type -->
  <name2type_pattern></name2type_pattern>

  <!-- drop lengthy program.source from match name 
  match_sim4_na_EST_complete_dros RE37642.5prime-2L_wgs2cex-na_EST.complete.dros
  also fix these ugly names repeating feature types;
  match:sim4:na_STS.dros  BACN11E12-T7-211000022278175-na_STS.dros-sim4
    cuttype=1

  more cuttype names
  -211000022278591-aa_SP.real.dros-blastx
  match:blastx:aa_SPTR.dros       ID|AAL57609|SPTR|AAL57609-211000022278591-aa_SPTR.dros-blastx
  match:blastx:aa_TR.real.dros    Q967T2-211000022278591-aa_TR.real.dros-blastx
  match:blastx:aa_TR.real.dros    Q94885-211000022279519-aa_TR.real.dros-blastx
  match:blastx:aa_SPTR.dros       ID|AAL90081|SPTR|AAL90081-linked_7-aa_SPTR.dros-blastx  
  match:blastx:aa_SPTR.dros       ID|AAL48487|SPTR|AAL48487-AE003846R_extension-aa_SPTR.dros-blast
  match:groupest:na_DGC.dros      RH25653.3prime_revcomp-AABU01000160-na_DGC.dros-groupest

  perl -pi.old \
   -e's,\-AABU0\d+\S+dros\S+,,;' \
   -e's,\_extension\S+,,;' \
   -e's,\-linked_\S+,,;' \
   -e's,\-2110000\d+\S+,,;'

  -->

  <mapname_pattern name="1match" type="^match.*" cuttype="1" from="null" to="null"/>
  <mapname_pattern name="2match" type="^match.*" from="\-?(2L|2R|3L|3R|4|X)[_\.].*$" to=""/>
  <mapname_pattern name="3match" type="^match.*" from="\_extension\S+$" to=""/>
  <mapname_pattern name="4match" type="^match.*" from="\-linked_\S+$" to=""/>
  <mapname_pattern name="5match" type="^match.*" from="\-2110000\d+\S+$" to=""/>

  <mapname_pattern name="cex" from="_wgs3_centromere_extension" to="_wgs2cex"/>
  <mapname_pattern name="dum" from="\-dummy\-" to=""/>
  <mapname_pattern name="tep" type="transposable_element_pred" 
    from="JOSHTRANSPOSON\-" to=""/>

<!--     <mapname_pattern name="psepred" type="^(gene|mRNA)"  -->
<!--       from="Contig[_\d]+" to=""/> -->

<!--   <maptype_pattern name="simwrap" from="sim4:wrap.*" to="sim4:wrap"/> -->
<!--   <maptype_pattern name="null" from="null" to="null"/> -->

<!--
## SONG/so Revision: 1.45
##     @is_a@oligo ; SO:0000696 ; SOFA:SOFA ; synonym:oligonucleotide
## 'so' is no longer valid
##   old value: @is_a@so ; SO:1000000
##  options are limited: located_sequence_feature, SO:0000110 ??
##  in flybase, 'so' seems used for protein blast matches?
## segment not in this    
## alt choices ...
#      @is_a@assembly ; SO:0000353 ; SOFA:SOFA
# **    @is_a@golden_path ; SO:0000688 ; SOFA:SOFA   <<
# **    @is_a@supercontig ; SO:0000148 ; SOFA:SOFA ; synonym:scaffold    <<
#     @is_a@tiling_path ; SO:0000472 ; SOFA:SOFA
#     @is_a@virtual_sequence ; SO:0000499 ; SOFA:SOFA
#     @is_a@chromosome ; SO:0000340
#     @part_of@chromosome_arm ; SO:0000105

## aug04: add new analysis features (HDP,RNAiHDP,fgenesh,)
## these are like exons but parent feature lacks featureloc 
## - need to join together by object_oid/parent_oid and compute parent feature (has name)
## SO type.subtype should be match.program
## SONG: match, match_part match_set nucleotide_match cross_genome_match cDNA_match EST_match

#? use '.' instead of '_' for part type? would that throw gnomap/gbrowse usage? probably
-->

  <!-- flybase chado has these with fmin == 1-origin, others are 0-origin; why?? -->
  <origin_one
    chromosome_arm="1"
    chromosome_band="1"
    chromosome="1"
    />

  <topsort
    chromosome_arm="1"
    chromosome="1"
    />

  <segmentfeats 
    BAC="1"
    chromosome_arm="1"
    chromosome_band="1"
    chromosome="1"
    golden_path_region="1"
    golden_path="1"
    segment="1"
    source="1"
    />
  <!-- 
      ## segment no longer valid SO; supercontig or golden_path are best
  -->

  <!-- simplefeat == segmentfeats + others -->
  <simplefeat 
    BAC="1"
    chromosome_arm="1"
    chromosome_band="1"
    chromosome="1"
    gene="1"
    golden_path_region="1"
    golden_path="1"
    mature_peptide="1"
    oligonucleotide="1"
    point_mutation="1"
    pseudogene="1"
    region="1"
    repeat_region="1"
    segment="1"
    source="1"
    transcription_start_site="1"
    />

  <dropname 
    mRNA_genscan="1"
    mRNA_piecegenie="1"
    transcription_start_site="1"
    tRNA_trnascan="1"
    match_repeat_runner="1"
    match_repeat_runner_seg="1" 
    />

  <dropid 
    cDNA_clone="1"
    chromosome_band="1"
    EST="1"
    exon="1"
    intron="1"
    oligonucleotide="1"
    processed_transcript="1"
    repeat_region="1"
    transcription_start_site="1"
    transposable_element_pred="1"
    match_repeat_runner="1"
    match_repeat_runner_seg="1" 
    />

  <dropfeat_fff 
    CDS_exon="1"
    CDS="1"
    exon="1"
    five_prime_UTR="1"
    intron="1"
    remark="1"
    three_prime_UTR="1"
    match_repeat_runner="1"
    />

  <dropfeat_gff 
    CDS_exon="1"
    remark="1"
    />

  <!-- for skipaskid and segmentfeats types, parent_oid is ignored; don't try to
    make a dubious, possibly huge, compound feature -->
  <skipaskid 
    point_mutation="1"
    region="1"
    repeat_region="1"
    transcription_start_site="1"

    match_part_repeat_runner_seg="1"
    match_part_repeat_runner="1"
    match_part_promoter="1"
    />

  <hasdups 
    three_prime_UTR="1" 
    intron="1" 
    five_prime_UTR="1" 
    exon="1" 

    match_repeat_runner_seg="1"
    match_repeat_runner="1"
    />

    <!--   hasdups  match_repeat_runner="1"
    ? drop match_repeat_runner for match_repeat_runner_seg 
    match_repeat_runner="skip"
    -->

  <maptype 
    match_part_repeat_runner="skip"
    match_part_repeat_runner_seg="skip"
    match_part_promoter="skip"

    protein="CDS"
    CDS="CDS_exon"
    five_prime_untranslated_region="five_prime_UTR"
    golden_path_region="golden_path"
    match_fgenesh="match_fgenesh"
    match_HDP="match_HDP"
    match_RNAiHDP="match_RNAiHDP"
    mRNA_genscan="mRNA_genscan"
    mRNA_piecegenie="mRNA_piecegenie"
    mRNA_trnascan="tRNA_trnascan"
    oligonucleotide="oligo"
    three_prime_untranslated_region="three_prime_UTR"
    transposable_element_pred="transposable_element_pred"
    />

  <!--
  so => "located_sequence_feature", ## leave in for now; no replacement for so ; SO:1000000
  -->

  <maptype_gff>
    <!-- FIXME: ordered arrays here .. $type, $gffsource -->
    <!-- change to hash of hash : { fulltype => { gfftype => val, gffsource => val } } -->
    <match_part_fgenesh>match_part</match_part_fgenesh>
    <match_part_fgenesh>fgenesh</match_part_fgenesh>
    <match_part_RNAiHDP>match_part</match_part_RNAiHDP>
    <match_part_RNAiHDP>RNAiHDP</match_part_RNAiHDP>
    <mRNA_piecegenie>mRNA</mRNA_piecegenie>
    <mRNA_piecegenie>piecegenie</mRNA_piecegenie>
    <match_part_HDP>match_part</match_part_HDP>
    <match_part_HDP>HDP</match_part_HDP>
    <match_HDP>match</match_HDP>
    <match_HDP>HDP</match_HDP>
    <transposable_element_pred>transposable_element</transposable_element_pred>
    <transposable_element_pred>predicted</transposable_element_pred>
    <tRNA_trnascan>tRNA</tRNA_trnascan>
    <tRNA_trnascan>trnascan</tRNA_trnascan>
    <match_fgenesh>match</match_fgenesh>
    <match_fgenesh>fgenesh</match_fgenesh>
    <mRNA_genscan>mRNA</mRNA_genscan>
    <mRNA_genscan>genscan</mRNA_genscan>
    <match_RNAiHDP>match</match_RNAiHDP>
    <match_RNAiHDP>RNAiHDP</match_RNAiHDP>
  </maptype_gff>

</opt>

--- NEW FILE: tognomap.xml ---
<opt
  name="tognomap"
  date="20040821"
  >

  <title>Genome Feature merge (genomic and cytology features)</title>
  <about>
    These configs help merge two or more fff feature sets
    for use with gnomap, gbrowse (fb), and similar feature-file consumers.

    Add these to per-release-db configs.
    Cut from mergeflyfeats4.pl and .xml config, d.gilbert, aug04
  </about>

  <informat>fff</informat>  
  <informat>dna</informat>  

  <seqids
    path="tmp/chado-fb.ids"
    date="20040721" 
    noIDmap="cytowalk|protein|mRNA|CDS|EST|cDNA|oligo|processed|repeat|sim4"
    indexidtype="^(gene|pseudogene|\w+RNA)"
    indexidpattern="[A-Z]{2}gn\d+"
  />

  <seqfeat
    path="fff/[\w\-\_]+.fff"
    date="20040804"
    seqfeat="1"
    name="Annotation feature parts" 
    >
    <drop>chromosome_band</drop>
    <drop>remark</drop>
    <drop>source</drop>
  </seqfeat>

  <!-- cyto-feature input tables. The name character class carries a '+'
    quantifier, like the seqfeat path above, so that file names whose
    variable part is longer than one character can match. -->
  <cytofeat
    path="cytomap/(cyto-features|cytofeat)-[\w\-\_]+.tsv"
    name="cyto-feature parts" 
    cytofeat="1"
    date="20040707" 
  />

  <sorsa
    path="cytomap/sorsa.txt"
    name="Table of genome:cytology mapping"
    date="20020817" 
  />

</opt>

--- NEW FILE: fbbulk-r4.xml ---
<opt
  name="fbbulk-r4" 
  relid="4"
  ROOT="${GMOD_ROOT}/"
  TMP="${GMOD_ROOT}/tmp"
  datadir="genomes/Drosophila_melanogaster"
>
<!--   datadir="data2/fban" -->

  <title>FlyBase Chado DB r4.0 test</title>
  <about>
   Configurations to extract feature and sequence data for
   bulk files from FlyBase chado databases, including various
   release information.
  </about>
  <doc name="README">
    D. melanogaster euchromatin genome data from FlyBase
    Release 4.0t.  See http://flybase.net/annot/dmel_4.txt
  </doc>

  <release id="4" 
    rel="r4.0t"  
    dbname="dmel_chado"  
    relfull="dmel_r4_0t_20040821"
    date="20040821" 
    release_url="/annot/release4.html"
    />
  <release id="3" 
    rel="r3.2.1" 
    relfull="dmel_r3.2.1_07212004"
    dbname="chado_r3_2_27" 
    date="20040804" 
    release_url="/annot/release3.2.1.html"
    />
  <release id="2" 
    rel="r3.2.0" 
    relfull="dmel_r3.2.0_03162004"
    dbname="chado_r3.2_18" 
    date="20040321" 
    release_url="/annot/release3.2.html"
    />
  <release id="1" 
    rel="r3.1.0" 
    relfull="dmel_r3.1.0_12182003"
    dbname="chado_r3.1" 
    date="20031218" 
    release_url="/annot/release3.1.html"
    />
  <release id="h3" 
    rel="r3_2h"  
    dbname="chadohet_081604" 
    relfull="dmel_hetr3_2_08162004"
    date="20040821" 
    release_url="/annot/het-release3.2.html"
    />
  <release id="p1" 
    rel="dpse_r1.0"  
    relfull="dpse_r1_0_20040821"
    dbname="dmelr3_2_dpser1_0" 
    date="20040821" 
    release_url="/annot/dpse-release1.html"
    />

  <!-- db.name  is release-dependent ; use above -->
  <db
    driver="Pg"
    name="dmel_chado"
    host="bugbane.bio.indiana.edu"
    port="7302"
    user="flybase"
    password=""
    />

  <org>dmel</org>
  <species>Drosophila melanogaster</species>

  <!-- get chromosomes from featdump chromosomes -->
  <chromosomes>X</chromosomes>
  <chromosomes>2L</chromosomes>
  <chromosomes>2R</chromosomes>
  <chromosomes>3L</chromosomes>
  <chromosomes>3R</chromosomes>
  <chromosomes>4</chromosomes>

  <!-- dnadump FIXME; need to extract scaffold dna from chado db -->
  <dnadump
    path="dna/[\w\-\_]+.raw"
    sql="select feature_id, residues from feature where uniquename = ?"
    />

  <featdump
    path="tmp/featdump/\w+.tsv"
    config="chadofeatsql"
    tag="feature_sql"
    type="feature_table"
    splitname="chadofeat"
    >
<!--     <target>fbids</target> // accessory table ; fixme -->
    <target>chromosomes</target>
    <target>features</target>
    <target>matches</target>
    <target>analysis</target>

    <!-- ? use this for sql arguments ? -->
    <!-- <target name="chromosomes" arg1="chromosome,chromosome_arm"/> -->
  </featdump>

<!--   <dnafiles -->
<!--     path="dna/[\w\-\_]+.raw" -->
<!--     name="Chromosome dna"  -->
<!--   /> -->
  <fileset
    name="dna"
    path="dna/[\w\-\_]+.raw"
    sql="select feature_id, residues from feature where uniquename = ?"
    title="Chromosome dna"
    />

<!--   <featfiles -->
<!--     path="fff/[\w\-\_]+.fff" -->
<!--     name="Genome features"  -->
<!--     config="chadofeatconv" -->
<!--   /> -->
  <fileset
    name="fff"
    path="fff/[\w\-\_]+.fff"
    title="FFF Genome features"
    config="chadofeatconv"
    handler="FeatureWriter"
    dogzip="1"
    />
  <fileset
    name="gff"
    path="gff/[\w\-\_]+.gff"
    title="GFF Genome features"
    dogzip="1"
    />

  <fileset
    name="gnomap"
    path="gnomap/[\w\-\_]+.tsv"
    config="tognomap"
    handler="GnomapWriter"
    title="GnoMap features"
    indexonly="0"
    />
  <fileset
    name="gbrowse"
    path="gnomap/gbrowse.conf"
    config="gbrowseconf" 
    handler="GnomapWriter"
    withvars="1"
    />

<!--   <fastafiles -->
<!--     path="fasta/[\w\-\_]+.fasta" -->
<!--     name="Genome feature sequence fasta"  -->
<!--     dropnotes="synonym_2nd,synonym" -->
<!--     dogzip="1" -->
<!--   /> -->
<!--   <blastfiles -->
<!--     path="blast/[\w\-\_]+.*" -->
<!--     name="Blast indices"  -->
<!--     config="blastfiles" -->
<!--   /> -->
  <fileset
    name="fasta"
    path="fasta/[\w\-\_]+.fasta"
    config="tofasta"
    handler="FastaWriter"
    dropnotes="synonym_2nd,synonym"
    makeall="1"
    perchr="1"
    dogzip="1"
    title="Genome feature sequence fasta" 
    />
  <fileset
    name="blast"
    path="blast/[\w\-\_]+.*"
    config="blastfiles"
    handler="BlastWriter"
    title="Blast indices" 
    />

  <!-- use to add parent ids to features FBgn|FBti ; NOT these - FBan|CG|CR -->
  <idpattern>(FBgn|FBti)\d+</idpattern>

  <!-- FIXME  
    new parse steps put all individual feats to fasta ; 
    change back to use featset info
  -->

<featset>gene</featset>
  <featset>mRNA</featset>
  <featset>CDS</featset>
  <featset>transcript</featset>
  <featset>translation</featset>
  <featset>tRNA</featset>
  <featset>miscRNA</featset>
  <featset>transposon</featset>
  <featset>pseudogene</featset>
  <featset>gene_extended2000</featset>
  <featset>five_prime_UTR</featset>
  <featset>three_prime_UTR</featset>
  <featset>intron</featset>
  <featset>intergenic</featset>
  <featset>scaffold</featset>

  <!-- featmap moved to common bulkfiles/chadofeatconv
    but can override here if desired to add/replace/delete -->

  <featmap
    name="translation"
    types="CDS"
    typelabel="protein"
    fromdb="1"
    />
  <featmap
    name="miscRNA"
    types="ncRNA snRNA snoRNA rRNA" 
    />
  <featmap
    name="gene_extended2000"
    types="gene"
    typelabel="gene_ex2000"
    subrange="-2000..2000"
    />
  <featmap
    name="intergenic"
    types="gene"
    typelabel="intergenic"
    method="between"
    />

</opt>

--- NEW FILE: chadofeatsql.xml ---
<opt
  name="chadofeatsql" 
  date="20040821"
  >

  <title>FlyBase Chado DB SQL</title>

  <about>
    This is a collection of Chado DB SQL calls to extract
    all basic genome features, into intermediate feature_table form

  arm     fmin    fmax    strand  type    name    id      oid     attr_type       attribute
  2L      0       305900  1       golden_path_region      AE003590        AE003590        1273141 dbxref   Gadfly:AE003590
  2L      6364    6366    1       transcription_start_site        6365-6366-AE999999.Fake-dummy-promoter   6365-6366-AE999999.Fake-dummy-promoter  1273564         
  2L      6773    6808    1       exon            NULL:960558     1273721 parent_oid      1273720:1
  2L      6773    9276            mRNA    6773,9276-AE999999.Fake-dummy-piecegenie        NULL:9605

    There is a tag feature_sql, type="feature_table" for each group of
    features needing separate SQL: chromosome/super-contigs,
    gene_models, matches, analyses, syntenic features ..

    Logic in Bulkfiles::FeatureWriter then merges, selects/reformats
    these feature_tables and writes bulk files per chromosome.  One can
    also attach post-processing scripts (see matches below).
  </about>

  <feature_sql name="fbids" type="list" output="chado-fb.ids">
  <sql>
    -- used for flybase to exclude cytologic features with same genome feature id
    SELECT accession FROM dbxref 
    WHERE accession like '%FBgn%' or accession like '%FBti%';
  </sql>
  </feature_sql>

  <feature_sql name="chromosomes" type="feature_table" output="chromosomes.tsv">
  <sql>
-- get chromosome/arm/supercontig/... lengths/names
-- add get-residues to file 
SELECT 
  CASE WHEN armft.name IS NULL THEN armft.uniquename ELSE armft.name END AS arm, 
  '1' as fmin, armft.seqlen as fmax, '0' as strand,
  featcv.name as type,
  armft.name as name,
  armft.uniquename as id,
  armft.feature_id as oid,
  'species' as attr_type, 
  org.genus || '_' || org.species AS attribute
FROM 
  feature armft, organism org, cvterm featcv
WHERE 
  featcv.name in ( 'chromosome', 'chromosome_arm')  --  need more choices 
  and armft.organism_id = org.organism_id
  and armft.type_id = featcv.cvterm_id
ORDER BY arm ;
  </sql>

  </feature_sql>

  <feature_sql name="features" type="feature_table" output="features.tsv">
  <sql>
-- standard features (excluding matches, analyses)
SELECT  distinct
  CASE WHEN armft.name IS NULL THEN armft.uniquename ELSE armft.name END AS arm, 
  armloc.fmin, armloc.fmax, armloc.strand,
  targcv.name as type,
  targft.name as name,
  targft.uniquename as id,
  targft.feature_id as oid,
  attr.type as attr_type, attr.attribute

FROM
  feature armft, 
  feature targft left outer join gffattr_gmodel attr
    on (targft.feature_id = attr.feature_id),
  featureloc armloc, 
  cvterm targcv

WHERE
  NOT (targcv.name in ('match')) --  add orthologous_region, syntenic_region
  and targft.type_id = targcv.cvterm_id 
  and armft.feature_id = armloc.srcfeature_id  
  and targft.feature_id = armloc.feature_id  

ORDER BY
   arm, armloc.fmin, targcv.name
;
  </sql>
  </feature_sql>

  <feature_sql name="matches" type="feature_table"  output="matches.tsv">
  <sql>
-- use this to get the paired genome/target match items w/ attr (or null) --
-- jun04 - need to filter out apollo dupl. evidence for
-- match features, type = transposable_element_insertion_site
SELECT distinct
  CASE WHEN armft.name IS NULL THEN armft.uniquename ELSE armft.name END AS arm, 
  armloc.fmin, armloc.fmax, armloc.strand,
  targcv.name as type, 
  targft.name as name,
  targft.uniquename as id,
  targft.feature_id as oid,
  attr.type as attr_type, attr.attribute

FROM
  feature matchft, feature armft, 
  feature targft left outer join gffattr_match attr
     on (targft.feature_id = attr.feature_id),
  featureloc armloc, featureloc targloc,
  cvterm targcv, cvterm matchcv, cvterm armcv

   -- tested speed of this w/ cv.names versus cvterm_id number inserted
   -- pg is smart enough to optimize cv name lookup once - leave as is
WHERE
  matchcv.name = 'match' 
  and matchcv.cvterm_id = matchft.type_id 
  and armcv.name in ( 'chromosome', 'chromosome_arm')
  and armcv.cvterm_id = armft.type_id  

  and targft.type_id = targcv.cvterm_id 
  and targft.feature_id != armft.feature_id  

  -- separate paired features here - keep chr-arm loc, target types/attr
  and matchft.feature_id = armloc.feature_id  
  and armft.feature_id = armloc.srcfeature_id

  and matchft.feature_id = targloc.feature_id  
  and targft.feature_id = targloc.srcfeature_id 

ORDER BY
   arm, armloc.fmin, targcv.name
;
  </sql>
  </feature_sql>

  <feature_sql name="analysis" type="feature_table" output="analysis.tsv">

<target name="main" depends="query,postprocess"/> 
  <target name="query" action="sql" /> 
  <target name="postprocess" action="rdump" description="fix parents w/o featureloc from kid values" />

  <!--  this works: perl -i.old rdump $r/tmp/featdump/analysis.tsv -->
  <script name="rdump" type="postprocess" shell="perl -i" language="perl">
<![CDATA[
# Post-process a tab-separated feature table read via <>: rows are grouped by
# their last column, and each finished group is rewritten and printed by rdump().
while(<>) { 
  # $d is the grouping key: the last tab-separated field (its trailing newline included)
  my @v= split/[\t]/; $d= $v[-1]; 
  # a row whose first field is "arm" is printed unchanged (presumably the column header row)
  if ($v[0] eq "arm") { print join("\t",@v); next; } 
  # key changed: flush the rows buffered so far before starting a new group
  rdump() if ($ld && $d ne $ld);  
  push(@r,\@v); $ld= $d; 
  # end of the current input file: flush the final group
  rdump() if (eof);
  } 

# Rewrite and print the rows buffered in @r, then empty the buffer.
# Row 0 is treated as the parent and rows 1.. as its parts:
#  - the parent's fields 0..3 (arm, fmin, fmax, strand in the feature_table
#    layout) are copied from row 1, with fmin/fmax widened to span rows 2..
#  - the parent's last field is dropped and its new last field becomes "\n"
#  - field 4 (type) of every part has a leading "match_" changed to "match_part_"
# NOTE(review): this relies on the parent row arriving first within its group;
# the analysis query in this file has no ORDER BY - confirm the row order.
# NOTE(review): a group holding a single row has no row 1, so the parent's
# fields 0..3 are overwritten with undefined values - confirm this cannot occur.
sub rdump() {
  $p=$r[0]; $k= $r[1]; ($a,$b,$e,$s)= @{$k}[0..3];
  # extend the span: smallest field 1 and largest field 2 over the remaining parts
  foreach $k (@r[2..$#r]) {
    $kb=$$k[1]; $ke=$$k[2]; $b=$kb if ($kb<$b); $e=$ke if ($ke>$e); }
  $$p[0]=$a; $$p[1]=$b; $$p[2]=$e; $$p[3]=$s; pop(@$p); $$p[-1]="\n"; $r[0]=$p;
  foreach $i (1..$#r) { $r[$i]->[4] =~ s/^match_/match_part_/; }
  # emit the whole group, parent first, and reset the buffer for the next group
  foreach $r (@r) { print join("\t",@$r); } @r=(); 
  } 

]]>
  </script>

  <!-- want sql tag at top level of feature_sql like others -->
  <sql>
-- select all an features of right type, with arm featureloc when avail
-- add analysis.sourcename where needed (ignore 'dummy');
-- change 'match_' leading type ; drop or make option
-- One row per analysis feature of the listed programs, in feature_table column
-- order: arm, fmin, fmax, strand, type, name, id, oid, attr_type, attribute.
-- Location columns come from gffatts_anfloc and are NULL when the feature has
-- no chromosome/arm featureloc (left outer join).
SELECT
  armft.arm, armft.fmin, armft.fmax, armft.strand,
  -- 'match_' || an.program as type,
  -- COALESCE guards the concatenation: in PostgreSQL, text || NULL is NULL, so
  -- a NULL an.sourcename would otherwise turn the whole type into NULL.
  'match_' || an.program || COALESCE(an.sourcename, '') as type,
  targft.name as name,
  targft.uniquename as id,
  targft.feature_id as oid,
  -- features without a gffatts_evid row (no parent) report their own oid
  COALESCE(attr.type, text('object_oid')) as attr_type,
  COALESCE(attr.attribute, text(targft.feature_id)) as attribute
FROM
  feature targft left outer join gffatts_evid attr
    on (targft.feature_id = attr.feature_id),
  analysisfeature anf left outer join gffatts_anfloc armft
    on (anf.feature_id = armft.feature_id),
  analysis an
WHERE
  an.program in ('HDP','RNAiHDP','fgenesh') -- FIXME add params here
  and an.analysis_id = anf.analysis_id
  and anf.feature_id = targft.feature_id
;
  </sql>
  </feature_sql>

  <feature_sql name="views" type="view">
  <sql>
-- from sequence-gff-views.sql 
-- dont need all of these attribs;
--   dbxref: yes, cvterm: no, synonym: no?, pub: no, 
--   featureprop - want some: cyto_range, gbunit?(no)
-- add dbxref_2nd, aug04 for dbxref need to know primary/secondary > feature_dbxref.is_current !

-- attr view for regular features which may have parent features (exons)
CREATE OR REPLACE VIEW gffattr_gmodel (
    feature_id, type, attribute
) AS
  SELECT feature_id,  
    CASE WHEN fs.is_current IS FALSE THEN 'dbxref_2nd' ELSE 'dbxref' END AS type, 
    d.name || ':' || s.accession AS attribute
  FROM dbxref s, feature_dbxref fs, db d
  WHERE fs.dbxref_id = s.dbxref_id and s.db_id = d.db_id

UNION ALL
  SELECT feature_id, cv.name AS type, fp.value AS attribute
  FROM featureprop fp, cvterm cv
  WHERE fp.type_id = cv.cvterm_id
    and (cv.name = 'cyto_range' or cv.name = 'gbunit')
    -- keep this restriction - other props not useful here: comments, sp_comment, owner ...

UNION ALL
  SELECT feature_id,
    CASE WHEN fs.is_current IS FALSE THEN 'synonym_2nd' ELSE 'synonym' END AS type, 
    s.synonym_sgml AS attribute
  FROM feature_synonym fs, synonym s
  WHERE fs.synonym_id = s.synonym_id and fs.is_internal IS FALSE

UNION ALL
  -- add parent feat ids for exons, etc.
  SELECT pk.subject_id AS feature_id, 'parent_oid' AS type, 
    CASE  
      WHEN pk.rank IS NULL THEN text(pk.object_id)
      ELSE pk.object_id || ':' || pk.rank
    END 
  FROM feature_relationship pk
;
GRANT SELECT ON gffattr_gmodel TO PUBLIC;

-- attr view for match features
CREATE OR REPLACE VIEW gffattr_match (
    feature_id,  type,  attribute
) AS
  SELECT feature_id,  
    CASE WHEN fs.is_current IS FALSE THEN 'dbxref_2nd' ELSE 'dbxref' END AS type, 
    d.name || ':' || s.accession AS attribute
  FROM dbxref s, feature_dbxref fs, db d
  WHERE fs.dbxref_id = s.dbxref_id and s.db_id = d.db_id

UNION ALL
  SELECT feature_id, cv.name AS type, fp.value AS attribute
  FROM featureprop fp, cvterm cv
  WHERE fp.type_id = cv.cvterm_id
    and (cv.name = 'cyto_range' or cv.name = 'gbunit')

UNION ALL
  SELECT feature_id,
    CASE WHEN fs.is_current IS FALSE THEN 'synonym_2nd' ELSE 'synonym' END AS type, 
    s.synonym_sgml AS attribute
  FROM feature_synonym fs, synonym s
  WHERE fs.synonym_id = s.synonym_id and fs.is_internal IS FALSE
;
GRANT SELECT ON gffattr_match TO PUBLIC;

-- attrib view for cross-species feats (synteny, orthology)
CREATE OR REPLACE VIEW gffattr_synteny (
    feature_id, type,  attribute
) AS
  --  parent feat ids for source supercontigs, etc.
  SELECT pk.subject_id, 'parent_oid' as type, 
    CASE  
      WHEN pk.rank IS NULL THEN text(pk.object_id)
      ELSE pk.object_id || ':' || pk.rank
    END 
  FROM feature_relationship pk
;
GRANT SELECT ON gffattr_synteny TO PUBLIC;

-- for analysis features
CREATE OR REPLACE VIEW gffatts_anfloc (
    feature_id, arm, fmin, fmax, strand
  ) AS
  SELECT 
    armloc.feature_id,  
    CASE WHEN armft.name IS NULL THEN armft.uniquename ELSE armft.name END AS arm,
    armloc.fmin, armloc.fmax, armloc.strand
  FROM  feature armft, featureloc armloc, cvterm armcv
  WHERE 
    armft.type_id = armcv.cvterm_id
    and armcv.name in ( 'chromosome', 'chromosome_arm')
    and armft.feature_id = armloc.srcfeature_id
;
GRANT SELECT ON gffatts_anfloc TO PUBLIC;

-- for analysis features
CREATE OR REPLACE VIEW gffatts_evid (
    feature_id, type, attribute
) AS
  SELECT pk.subject_id, text('parent_oid'), text(pk.object_id)
  FROM feature_relationship pk
;
GRANT SELECT ON gffatts_evid TO PUBLIC;

  </sql>
  </feature_sql>
</opt>

--- NEW FILE: chadofeatconv.xml ---
<opt
  name="chadofeatconv"
  date="20040821"
  >

  <title>Chado DB Feature info</title>
  <about>
    These are configurations for converting chado feature table dumps to
    standard feature/sequence files. Much of below specifies how to
    process different features (tied to methods in ChadoFeatDump.pm).

    These configs should be data-set independent.
    This works with, but is independent of SeqUtil2 configs.
  </about>

  <informat>feature_table</informat>  

  <outformats>fff</outformats>
  <outformats>gff</outformats>
  <outformats>fasta</outformats>

  <!-- copied from db-release config files .. this is mostly common info,
    but db-release config can override @featset, %featmap -->

  <!-- feature sets to make fasta bulk files -->
  <featset>gene</featset>
  <featset>mRNA</featset>
  <featset>CDS</featset>
  <featset>transcript</featset>
  <featset>translation</featset>
  <featset>tRNA</featset>
  <featset>miscRNA</featset>
  <featset>transposon</featset>
  <featset>pseudogene</featset>
  <featset>gene_extended2000</featset>
  <featset>five_prime_UTR</featset>
  <featset>three_prime_UTR</featset>
  <featset>intron</featset>
  <featset>intergenic</featset>
  <featset>scaffold</featset>

  <!-- 
   featmap for feature sets that need reprocessing, attributes: 
    name = feature set name
    types = feature types, space delimited (in fff/gff)
    typelabel = type to use in output header, paired with types list
    subrange = expansion/extraction range to add to feature location
             (readseq syntax, i.e. {start,end}+/-offset, is not yet supported)
    fromdb = extract from chado database feature.residues field
             rather than chromosome dna file
             (for curated residues - transcript, translation)
    get_id = save id, dbxref fields for add to other (genemodel) features
    add_id = add id of parent feature (genemodel)
    note: translation type=protein is coded into various seq fetch programs
  -->

  <featmap name="gene" get_id="1" />
  <featmap name="five_prime_UTR" add_id="gene" />
  <featmap name="three_prime_UTR" add_id="gene" />
  <featmap name="intron" add_id="gene" />
  <featmap name="mRNA" add_id="gene" />
  <featmap name="CDS" add_id="gene" />

  <featmap
    name="translation"
    types="CDS"
    typelabel="protein"
    fromdb="1"
    />
  <featmap
    name="transcript"
    types="mRNA"
    typelabel="transcript"
    fromdb="1"
    />
  <featmap
    name="scaffold"
    types="golden_path_region"
    typelabel="scaffold"
    fromdb="1"
    />
  <featmap
    name="transposon"
    types="transposable_element"
    typelabel="transposable_element"
    />
  <featmap
    name="miscRNA"
    types="ncRNA snRNA snoRNA rRNA" 
    />
  <featmap
    name="gene_extended2000"
    types="gene"
    typelabel="gene_ex2000"
    subrange="-2000..2000"
    />
  <featmap
    name="intergenic"
    types="gene"
    typelabel="intergenic"
    method="between"
    />

  <rename_child_type>pseudogene|\w+RNA</rename_child_type>
  <mergematch></mergematch>

<!--
## SONG/so Revision: 1.45
##     @is_a@oligo ; SO:0000696 ; SOFA:SOFA ; synonym:oligonucleotide
## 'so' is no longer valid
##   old value: @is_a@so ; SO:1000000
##  options are limited: located_sequence_feature, SO:0000110 ??
##  in flybase, 'so' seems used for protein blast matches?
## segment not in this    
## alt choices ...
#      @is_a@assembly ; SO:0000353 ; SOFA:SOFA
# **    @is_a@golden_path ; SO:0000688 ; SOFA:SOFA   <<
# **    @is_a@supercontig ; SO:0000148 ; SOFA:SOFA ; synonym:scaffold    <<
#     @is_a@tiling_path ; SO:0000472 ; SOFA:SOFA
#     @is_a@virtual_sequence ; SO:0000499 ; SOFA:SOFA
#     @is_a@chromosome ; SO:0000340
#     @part_of@chromosome_arm ; SO:0000105

## aug04: add new analysis features (HDP,RNAiHDP,fgenesh,)
## these are like exons but parent feature lacks featureloc 
## - need to join together by object_oid/parent_oid and compute parent feature (has name)
## SO type.subtype should be match.program
## SONG: match, match_part match_set nucleotide_match cross_genome_match cDNA_match EST_match

#? use '.' instead of '_' for part type? would that throw gnomap/gbrowse usage? probably
-->

  <!-- remapType: append this name pattern to type -->
  <name2type_pattern>[-_](genscan|piecegenie|twinscan|genewise|pred|trnascan)</name2type_pattern>

  <!-- flybase chado has these with fmin == 1-origin, others are 0-origin; why?? -->
  <origin_one
    chromosome_arm="1"
    chromosome_band="1"
    chromosome="1"
    />

  <topsort
    chromosome_arm="1"
    chromosome="1"
    />

  <segmentfeats 
    BAC="1"
    chromosome_arm="1"
    chromosome_band="1"
    chromosome="1"
    golden_path_region="1"
    golden_path="1"
    segment="1"
    source="1"
    />
  <!-- 
      ## segment no longer valid SO; supercontig or golden_path are best
  -->

  <!-- simplefeat == segmentfeats + others -->
  <simplefeat 
    BAC="1"
    chromosome_arm="1"
    chromosome_band="1"
    chromosome="1"
    gene="1"
    golden_path_region="1"
    golden_path="1"
    mature_peptide="1"
    oligonucleotide="1"
    point_mutation="1"
    pseudogene="1"
    region="1"
    repeat_region="1"
    segment="1"
    source="1"
    transcription_start_site="1"
    />

  <dropname 
    mRNA_genscan="1"
    mRNA_piecegenie="1"
    transcription_start_site="1"
    tRNA_trnascan="1"
    />

  <dropid 
    cDNA_clone="1"
    chromosome_band="1"
    EST="1"
    exon="1"
    intron="1"
    oligonucleotide="1"
    processed_transcript="1"
    repeat_region="1"
    transcription_start_site="1"
    transposable_element_pred="1"
    />

  <dropfeat_fff 
    CDS_exon="1"
    CDS="1"
    exon="1"
    five_prime_UTR="1"
    intron="1"
    remark="1"
    three_prime_UTR="1"
    />

  <dropfeat_gff 
    CDS_exon="1"
    remark="1"
    />

  <skipaskid 
    point_mutation="1"
    region="1"
    repeat_region="1"
    transcription_start_site="1"
    />

  <hasdups 
    three_prime_UTR="1" 
    intron="1" 
    five_prime_UTR="1" 
    exon="1" 
    />

  <maptype 
    protein="CDS"
    CDS="CDS_exon"
    five_prime_untranslated_region="five_prime_UTR"
    golden_path_region="golden_path"
    match_fgenesh="match_fgenesh"
    match_HDP="match_HDP"
    match_RNAiHDP="match_RNAiHDP"
    mRNA_genscan="mRNA_genscan"
    mRNA_piecegenie="mRNA_piecegenie"
    mRNA_trnascan="tRNA_trnascan"
    oligonucleotide="oligo"
    three_prime_untranslated_region="three_prime_UTR"
    transposable_element_pred="transposable_element_pred"
    />
  <!--
  so => "located_sequence_feature", ## leave in for now; no replacement for so ; SO:1000000
  -->

    <mapname_pattern name="null" from="null" to="null"/>
    <mapname_pattern name="dum" from="\-dummy\-" to=""/>
    <mapname_pattern name="tep" type="transposable_element_pred" 
      from="JOSHTRANSPOSON\-" to=""/>

<!--   <maptype_pattern name="null" from="null" to="null"/> -->
<!--   <maptype_pattern name="simwrap" from="sim4:wrap.*" to="sim4:wrap"/> -->

  <maptype_gff>
    <!-- FIXME: ordered arrays here .. $type, $gffsource -->
    <!-- change to hash of hash : { fulltype => { gfftype => val, gffsource => val } } -->
    <match_part_fgenesh>match_part</match_part_fgenesh>
    <match_part_fgenesh>fgenesh</match_part_fgenesh>
    <match_part_RNAiHDP>match_part</match_part_RNAiHDP>
    <match_part_RNAiHDP>RNAiHDP</match_part_RNAiHDP>
    <mRNA_piecegenie>mRNA</mRNA_piecegenie>
    <mRNA_piecegenie>piecegenie</mRNA_piecegenie>
    <match_part_HDP>match_part</match_part_HDP>
    <match_part_HDP>HDP</match_part_HDP>
    <match_HDP>match</match_HDP>
    <match_HDP>HDP</match_HDP>
    <transposable_element_pred>transposable_element</transposable_element_pred>
    <transposable_element_pred>predicted</transposable_element_pred>
    <tRNA_trnascan>tRNA</tRNA_trnascan>
    <tRNA_trnascan>trnascan</tRNA_trnascan>
    <match_fgenesh>match</match_fgenesh>
    <match_fgenesh>fgenesh</match_fgenesh>
    <mRNA_genscan>mRNA</mRNA_genscan>
    <mRNA_genscan>genscan</mRNA_genscan>
    <match_RNAiHDP>match</match_RNAiHDP>
    <match_RNAiHDP>RNAiHDP</match_RNAiHDP>
  </maptype_gff>

</opt>
--- NEW FILE: blastfiles.xml ---
<opt 
  name="blastfiles"
  date="20040821"

  blasthome="${ARGOS_ROOT}/common/servers/blast/Bin" 
  formatdb="${ARGOS_ROOT}/common/servers/blast/Bin/formatdb" 
  formatdbopts="-o F " 
  isprot_patt="(translation|aa_)"
  >

  <title>Blast index writer</title>
  <about>
    These are configurations for updating blast indices,
    rc files, html pages, given genome fasta input files
    and fasta/feature groups.
  </about>

  <informat>fasta</informat>  
  <outformats>ncbi</outformats>
  <outformats> </outformats>

  <!-- feature sets to make blast indices -->
  <blastset>gene</blastset>
  <blastset>transcript</blastset>
  <blastset>translation</blastset>
  <blastset>tRNA</blastset>
  <blastset>miscRNA</blastset>
<!--   <blastset>transposon</blastset> -->
  <blastset>pseudogene</blastset>
  <blastset>intergenic</blastset>
  <blastset>scaffold</blastset>
  <blastset>chromosome</blastset>

  <blastmap
    name="miscRNA"
    types="ncRNA snRNA snoRNA rRNA" 
    />

  <doc name="dbrc" path="blast/blast.rc"><![CDATA[
# blast.rc 
# This is dynamic configuration file for NCBI WWW BLAST service
#
# Number of CPUs to use for a single request
#
NumCpuToUse     1
#
# Here are list of combinations program/database, 
# that allowed by BLAST service. Format: <program> <db> <db> ...
#
blastn  
blastp 
blastx 
tblastn 
tblastx 
  ]]></doc>

  <!-- also need to write some .nal, .pal ncbi blast files  to join other dbs 
    .. put tag into individual ones to join, eg. nalfile="na_all na_est"
  -->

<doc name="dbselect" path="blast/blastdb.in">
  
  
  <header title=""><![CDATA[ ]]></header>
  <footer title=""><![CDATA[ ]]></footer>
  </doc>

  <doc name="dbtable" path="blast/blast_databases.html">
  <!-- no content tag here -->
  <!-- optional <tableheader> four fields are dbname, dbfile, update-date, description -->
  <header title=""><![CDATA[<HTML>
  <HEAD>
  <TITLE>${species} BLAST Databases</TITLE> 
  </HEAD>

  <BODY>  
  <h1>
  <I>${species}</I> Genome BLAST
  </h1>
  <h2>
  Data release <a href="${release_url}">${rel}</a>
  <br>Data sets for BLAST search
  </h2>
  <HR>
  <B><LI>Database</B> - Choose a database &nbsp &nbsp <FONT COLOR="red"><B>required</B></FONT>
  <BR><BR>
  <TABLE BORDER=1 cellpadding="5" cellspacing="1">
  ]]></header>

  <footer title=""><![CDATA[
  <TR> 
  <TD colspan=4 align=left> <a name=key> <B>key: AA = protein, NT = nucleotide. 
  To download databases, visit the  <A HREF="/annot/" target=_self>Sequence 
  Download Page</A></B> </TD>
  </TR>
  </TABLE>
  </BODY>
  </HTML>
  ]]></footer> 
  </doc>

  <db name="na_all" title="  All ${species} sequences (NT)"><![CDATA[
    Euchromatin,  heterochromatin, and predicted genes; 
    genomic clones, ESTs, STSs, P-element 
    insertion sites, and public sequences from GenBank (see below)
  ]]>
  </db> 

    <!-- these are generic for any organism -->
  <db name="chromosome" title="  All genome chromosome arms (NT)"
      nalfile="na_all"
  ><![CDATA[  
    Whole genome by chromosome arms
  ]]>
  </db> 

  <db name="transcript" title="  Predicted genes (NT)"
    nalfile="na_all"
    ><![CDATA[
    Complete transcript (CDS + UTR) for all of the genes 
   ]]>
  </db> 

  <db name="translation" title="  Predicted proteins (AA)"><![CDATA[
    Peptide translations for all of the genes 
  ]]>
  </db> 

  <!-- fixme; need to extract scaffold dna from chado db -->
  <db name="scaffold" title=" Euchromatin and heterochromatin scaffolds (NT)"
    nalfile="na_all"
    ><![CDATA[
    Genomic sequence for the euchromatic chromosome arms, 
     and from the heterochromatin assembly,
     divided into ~350kb GenBank scaffolds 
  ]]>
  </db> 

  <db name="intergenic" title="Intergenic sequence (NT)"
    nalfile="na_all"
    ><![CDATA[
  Intergenic regions -- genome sequence between genes  
  ]]>
  </db> 

 <db name="transposon" skip="1" title="Transposon insertions (NT)"><![CDATA[
  Transposon insertion sites in genome 
  ]]>
  </db> 

  <!-- these are flybase specific -->

  <db name="na_cDNA" title=" BDGP full-length cDNAs (NT)"
    nalfile="na_all" org="dmel"
    ><![CDATA[
  <I>${species}</I> full-length cDNAs from the BDGP
  ]]>
  </db> 

  <db name="na_geno_clones" title=" BDGP/EDGP genomic clones (NT)"
    nalfile="na_all"  org="dmel"
    ><![CDATA[
  P1, BAC, and cosmid sequences from the BDGP & EDGP that have been submitted 
        to GenBank -- this does not include whole genome shotgun sequence 
  ]]>
  </db> 

<!--   <db name="dmel_hetero_scaffolds" title=" Heterochromatin scaffolds (NT)"><![CDATA[ -->
<!--   Genomic sequence from <a href="http://www.celera.com" target="_self">Celera</a>  -->
<!--         for the heterochromatin from the WGS3 assembly -->
<!--   ]]> -->
<!--   </db>  -->

  <db name="dmel_aa_swall" title=" SwissProt and TREMBL proteins (SWALL) (AA)"><![CDATA[
  A non-redundant combination of the <i>${species}</i> entries from the 
        SwissProt+TrEMBL+TrEMBLNew databases (SWALL)
  ]]>
  </db> 

  <db name="dmel_aa_uniprot" title=" UniProt proteins (AA)"><![CDATA[
  <i>${species}</i> proteins from UniProt
  ]]>
  </db> 

  <db name="dmel_aa_refseq" title=" RefSeq proteins (AA)"><![CDATA[
  <i>${species}</i> proteins from  RefSeq
  ]]>
  </db> 

  <db name="dmel_na_refseq" title=" RefSeq Sequences (NT)"><![CDATA[
  <i>${species}</i> nucleic sequences from  RefSeq
  ]]>
  </db>

  <db name="na_gb" title=" GenBank (without BDGP, EDGP, Celera or dbEST) (NT)"
      nalfile="na_all"
  ><![CDATA[
   <i>${species}</i> sequences collected from public sequence databases 
        (but without BDGP, EDGP, Celera, or dbEST sequences)
  ]]>
  </db> 

  <db name="na_est" title="ESTs from BDGP and dbEST cDNAs (NT)" org="dmel" ><![CDATA[
   <I>Drosophila</I> Expressed Sequence Tags from the BDGP and from dbEST 
        cDNAs (redundant)
  ]]>
  </db> 

  <db name="na_EST" part_of="na_est" title="" skip="1"
    nalfile="na_all na_est"
  ><![CDATA[
   Expressed Sequence Tags
  ]]>
  </db> 
  <db name="na_dbEST" part_of="na_est" title="" skip="1"
    nalfile="na_all na_est"
    ><![CDATA[
  dbEST  cDNAs
  ]]>
  </db> 

  <db name="na_pe" title=" P element insertion sites (NT)" nalfile="na_all" org="dmel"><![CDATA[
  Genomic sequence flanking BDGP P-element insertions (BDGP)
  ]]>
  </db> 

  <db name="na_re" title=" Repeats (NT)"  nalfile="na_all" ><![CDATA[
  a curated set of <i>${species}</i> known repeats
  ]]>
  </db> 

  <db name="na_STS" title=" STSs (NT)"  nalfile="na_all"><![CDATA[
   <I>Drosophila</I> STSs from the BDGP and EDGP mapping projects 
  ]]>
  </db> 

 <db name="na_te" title=" Transposons (NT)" nalfile="na_all"><![CDATA[
  a curated set of <i>${species}</i> transposable elements 
  ]]>
  </db> 

</opt>

--- NEW FILE: sgdbulk1.xml ---
<opt
  name="sgdbulk1"
  relid="1"
  ROOT="${GMOD_ROOT}/"
  TMP="${GMOD_ROOT}/tmp"
  datadir="genomes/Saccharomyces_cerevisiae"
  >

  <title>SGD Lite rel 1</title>

  <doc name="README"><![CDATA[
  Genome file output for Chado genome databases
  Database: SGD Lite rel 1
  Software: Bio::GMOD::Bulkfiles

  Chado genome databases available (aug 2004) are
  ftp://flybase.net/genomes/Drosophila_melanogaster/current/pgsql/chado_r3_2_26_s.gz
  http://sgdlite.princeton.edu/download/sgdlite/2004_05_19_sgdlite.sql.gz

  QUICK TEST:
  # get soft
  cvs -d :pserver:anonymous@cvs.sourceforge.net:/cvsroot/gmod \
    co -d GMODTools schema/GMODTools 

  # load chado db to Postgres
  createdb sgdlite_20040519
  wget http://sgdlite.princeton.edu/download/sgdlite/2004_05_19_sgdlite.sql.gz
  (zcat *sgdlite.sql.gz | psql -d sgdlite_20040519 -f - ) >& log.load 

  # set root path to here and make bulkfiles
  env GMOD_ROOT=$PWD  ARGOS_ROOT=$PWD \
   perl -I./GMODTools/lib/ GMODTools/bin/bulkfiles.pl sgdbulk1
  ]]></doc>

  <doc name="Test-results.txt"><![CDATA[
  Test for GMOD::Bulkfile processor
  aug 04, d.gilbert 

  dghome2% ls $sgr
  /bio/biodb/flybase/data2/fban/sgdlite_20040519:
  chadofeat-summary.txt  fasta/                 gff/                   tmp/
  dna/                   fff/                   gnomap/                

  dghome2% du $sgr
  24052   /bio/biodb/flybase/data2/fban/sgdlite_20040519/dna
  9928    /bio/biodb/flybase/data2/fban/sgdlite_20040519/fasta
  944     /bio/biodb/flybase/data2/fban/sgdlite_20040519/fff
  924     /bio/biodb/flybase/data2/fban/sgdlite_20040519/gff
  0       /bio/biodb/flybase/data2/fban/sgdlite_20040519/gnomap
  3980    /bio/biodb/flybase/data2/fban/sgdlite_20040519/tmp/featdump
  3980    /bio/biodb/flybase/data2/fban/sgdlite_20040519/tmp
  39836   /bio/biodb/flybase/data2/fban/sgdlite_20040519

  # Summary of features for Saccharomyces cerevisiae from SGD Chado DB Lite r1  [20040519]
  # ALL chromosomes
  ARS     59     
  CDS     7050
  centromere      16
  chromosome      17
  gene    13645
  long_terminal_repeat    382
  ncRNA   14
  non_transcribed_region  3
  noncoding_exon  466
  rRNA    56
  region  78
  repeat_family   110
  retrotransposon 50
  snRNA   14
  snoRNA  135
  tRNA    347
  telomere        32
  transcribed_spacer_region       16
  transposable_element_gene       100

  == process log ==
  DBI->connect( dbi:Pg:dbname=sgdlite_20040519;host=localhost;port=7302 )
  do sql  views view
  sql dump chromosomes feature_table /bio/biodb/flybase/data2/fban/sgdlite_20040519/tmp/featdump/
  chromosomes.tsv
  sql dump chromosomes n rows=17
  sql dump features feature_table /bio/biodb/flybase/data2/fban/sgdlite_20040519/tmp/featdump/fea
  tures.tsv
  sql dump features n rows=22573
  sortNSplitByChromosome:

  dumpChromosomeBases /bio/biodb/flybase/data2/fban/sgdlite_20040519/dna/scer_chrI_dna_sgdr1.raw
  ...

  makeFiles: outformats= fff gff fasta
  openInput: type=feature/table part=0 
  openInput: name=chrI, type=feature/table, /bio/biodb/flybase/data2/fban/sgdlite_20040519/tmp/fe
  atdump/chadofeat-chrI.tsv
  # output /bio/biodb/flybase/data2/fban/sgdlite_20040519/fff/scer_chrI_sgdr1.fff (append=0)
  # output /bio/biodb/flybase/data2/fban/sgdlite_20040519/gff/scer_chrI_sgdr1.gff (append=0)
  # output /bio/biodb/flybase/data2/fban/sgdlite_20040519/fasta/scer_chrI_sgdr1.fasta (append=0)
  putFeats n=11, total=11, oid1=126750
  open dnafile /bio/biodb/flybase/data2/fban/sgdlite_20040519/dna/scer_chrI_dna_sgdr1.raw, length
  =230210
  putFeats n=17, total=25, oid1=126761

  processChadoTable ndone = 1737
  openInput: type=feature/table part=18 
  openInput: nothing matches part=18
  makeFiles: done
  ]]></doc>

  <release id="1" rel="sgdr1"  dbname="sgdlite_20040519" date="20040519"
    relfull="sgdlite_20040519"
    release_url="/annot/sgdlite-release1.html"
    />
  <release id="0" rel="sgdr1"  dbname="sgdlite_20040519" date="20040519"
    relfull="sgdlite_20040519"
    release_url="/annot/sgdlite-release1.html"
    />
  <!-- the config reader needs two or more release entries, so id="0" above duplicates id="1" -->

  <db
    driver="Pg"
    name="sgdlite_20040519"
    host="localhost"
    port="7302"
    user=""
    password=""
    />

  <org>scer</org>
  <species>Saccharomyces cerevisiae</species>

  <dnadump
    path="dna/dna-\w+.raw"
    sql="select feature_id, residues from feature where uniquename = ?"
    />

  <featdump
    path="tmp/featdump/\w+.tsv"
    config="chadofeatsql"
    tag="feature_sql"
    type="feature_table"
    splitname="chadofeat"
    >
    <target>chromosomes</target>
    <target>features</target>
<!--  //none in sgdlite1//  <target>matches</target> -->
<!--  //none in sgdlite1//  <target>analysis</target> -->

    <!-- ? use this for sql arguments ? -->
    <!-- <target name="chromosomes" arg1="chromosome,chromosome_arm"/> -->
  </featdump>

  <fileset
    name="dna"
    path="dna/[\w\-\_]+.raw"
    sql="select feature_id, residues from feature where uniquename = ?"
    title="Chromosome dna"
    />

  <fileset
    name="fff"
    path="fff/[\w\-\_]+.fff"
    title="FFF Genome features"
    config="sgdfeatconf"
    handler="FeatureWriter"
    dogzip="1"
    />
  <fileset
    name="gff"
    path="gff/[\w\-\_]+.gff"
    title="GFF Genome features"
    dogzip="1"
    />

  <fileset
    name="gnomap"
    path="gnomap/[\w\-\_]+.tsv"
    config="tognomap"
    handler="GnomapWriter"
    title="GnoMap features"
    />
  <fileset
    name="gbrowse"
    path="gnomap/gbrowse.conf"
    config="gbrowseconf" 
    handler="GnomapWriter"
    withvars="1"
  />

  <fileset
    name="fasta"
    path="fasta/[\w\-\_]+.fasta"
    config="tofasta"
    handler="FastaWriter"
    dropnotes="synonym_2nd,synonym"
    makeall="1"
    perchr="0"
    dogzip="1"
    title="Genome feature sequence fasta" 
    />
  <fileset
    name="blast"
    path="blast/[\w\-\_]+.*"
    config="blastfiles"
    handler="BlastWriter"
    title="Blast indices" 
  />

  <idpattern></idpattern>

  <!-- feature sets to make fasta bulk files -->
  <featset>chromosome</featset>

  <featset>gene</featset>
  <featset>CDS</featset>
<!--   <featset>translation</featset> // no translations in sgdlite db -->
  <featset>tRNA</featset>
  <featset>miscRNA</featset>
  <featset>transposon</featset>
  <featset>gene_extended2000</featset>
  <featset>intergenic</featset>

  <!-- featmap moved to common bulkfiles/chadofeatconv
    but can override here if desired to add/replace/delete -->

<!-- sgd variant for this feature -->
  <featmap
    name="transposon"
    types="transposable_element_gene"
    typelabel="transposable_element"
    />

</opt>

--- NEW FILE: fbbulk-r3h.xml ---
<opt
  name="fbbulk-r3h"
  relid="h3"
  ROOT="${GMOD_ROOT}/"
  TMP="${GMOD_ROOT}/tmp"
  datadir="genomes/Drosophila_melanogaster"
  >

  <title>DHGP/FlyBase Heterochromatin rel 3.2</title>

  <doc name="README">
    D. melanogaster heterochromatin genome data from 
    Drosophila Heterochromatin Genome Project, www.dhgp.org
    Release 3.2
    See http://flybase.net/annot/dmel_het_release3.2.txt
  </doc>

  <!-- this could/should come from an included flybase-release.xml;
    do we want a default release/db xml that this file overrides where needed?
  -->

  <release id="3" 
    rel="r3.2.1" 
    relfull="dmel_r3.2.1_07212004"
    dbname="chado_r3_2_27" 
    date="20040804" 
    release_url="/annot/release3.2.1.html"
    />
  <release id="h3" 
    rel="r3_2h"  
    dbname="chadohet_081604" 
    relfull="dmel_hetr3_2_08162004"
    date="20040821" 
    release_url="/annot/het-release3.2.html"
    />

  <db
    driver="Pg"
    name="dmelhet_chado"
    host="bugbane.bio.indiana.edu"
    port="7302"
    user="flybase"
    password=""
    />

  <org>dmel</org>
  <species>Drosophila melanogaster</species>

  <dnadump
    path="dna/[\w\-\_]+.raw"
    sql="select feature_id, residues from feature where uniquename = ?"
    type="dna_dump"
    />

  <featdump
    path="tmp/featdump/\w+.tsv"
    config="chadofeatsql"
    tag="feature_sql"
    type="feature_table"
    splitname="chadofeat"
    >
<!--     <target>fbids</target> // accessory table ; fixme -->
    <target>chromosomes</target>
    <target>features</target>
    <target>analysis</target>

    <!--  <target>matches</target>  -->
    <!-- drop matches for analysis which has same/better info 
      hetr32 matches has only generic 'alignment' and 'sim4:wrap...'
    -->

  </featdump>

  <fileset
    name="feature_table"
    path="tmp/featdump/\w+.tsv"
    config="chadofeatsql"
    tag="feature_sql"
    splitname="chadofeat"
    >
    <target>chromosomes</target>
    <target>features</target>
    <target>analysis</target>
  </fileset>

  <!-- 
    filesets are output formats, each with its config, perl handler, location, etc.
    -->

  <fileset
    name="dna"
    path="dna/[\w\-\_]+.raw"
    input="dna_dump"
    sql="select feature_id, residues from feature where uniquename = ?"
    title="Chromosome dna"
    />

  <fileset
    name="fff"
    path="fff/[\w\-\_]+.fff"
    input="feature_table"
    title="FFF Genome features"
    config="dmelhetfeatconv"
    handler="FeatureWriter"
    dogzip="1"
    />

  <fileset
    name="gff"
    path="gff/[\w\-\_]+.gff"
    title="GFF Genome features"
    dogzip="1"
    />

  <fileset
    name="gnomap"
    path="gnomap/[\w\-\_]+.tsv"
    input="fff"
    config="tognomap"
    handler="GnomapWriter"
    title="GnoMap features"
    indexonly="0"
    />
  <fileset
    name="gbrowse"
    path="gnomap/gbrowse.conf"
    config="gbrowseconf" 
    handler="GnomapWriter"
    withvars="1"
    />

  <fileset
    name="fasta"
    path="fasta/[\w\-\_]+.fasta"
    input="fff"
    config="tofasta"
    handler="FastaWriter"
    dropnotes="synonym_2nd,synonym"
    makeall="1"
    perchr="1"
    dogzip="1"
    title="Genome feature sequence fasta" 
    />
  <fileset
    name="blast"
    path="blast/[\w\-\_]+.*"
    input="fasta"
    config="blastfiles"
    handler="BlastWriter"
    title="Blast indices" 
    />

<!--   <fileset -->
<!--     name="luseq" -->
<!--     input="fasta" -->
<!--     path="indices/lucene/seqs/.*" -->
<!--     title="lucene sequence indices" -->
<!--     config="lucegene" -->
<!--     handler="LucegeneWriter" -->
<!--     /> -->
<!--   <fileset -->
<!--     name="lufeat" -->
<!--     input="fff" -->
<!--     path="indices/lucene/gnomap/.*" -->
<!--     config="lucegene" -->
<!--     handler="LucegeneWriter" -->
<!--     title="lucene map indices" -->
<!--     /> -->

<idpattern>(FBgn|FBti)\d+</idpattern>

  <!-- feature sets to make fasta bulk files 
      .. change tag to fastaset ?
      .. add intergene set ?
  -->

  <featset>chromosome</featset>
<!--   <featset>scaffold</featset> ? no het scaffolds? -->

  <featset>gene</featset>
  <featset>mRNA</featset>
  <featset>CDS</featset>
  <featset>transcript</featset>
  <featset>translation</featset>
  <featset>tRNA</featset>
  <featset>miscRNA</featset>
  <featset>transposon</featset>
  <featset>pseudogene</featset>
  <featset>gene_extended2000</featset>

  <featset>intergenic</featset>

  <!--  no het gmodel parts ! -->
<!--   <featset>five_prime_UTR</featset> -->
<!--   <featset>three_prime_UTR</featset> -->
<!--   <featset>intron</featset> -->

  <!-- see featmap in the feature conversion config (dmelhetfeatconv for this release; chadofeatconv is the common one) -->

</opt>

--- NEW FILE: gbrowseconf.xml ---
<opt
  name="gbrowseconf"
  date="20040826"
  >

 <!-- need to call XML::Simple w/ ${variables} below in doc

  ${species} ${relfull} ${date}
  ${datapath}
  ${default_location} 
  ${chromosomes} == ${examples} ??
  readConfig( 'gbrowseconfig', { Variables => \%featvars });

 -->

  <title>Gbrowse conf generator</title>
  <about>
  </about>

<!-- add gbrowse.conf file parts here; as per blastfiles.doc -->
  <doc name="dummy"></doc>

  <doc name="gbrowse" path="gnomap/gbrowse.conf">
  <!-- no content tag here -->
  <header title="header"><![CDATA[ 
# gbrowse config file for  genome maps

[GENERAL]
description = ${species} ${relfull} ${date}
datapath    = ${gnomapdir}

browser title = Genome Browser
help = /maps/gbrowse

# adaptor     = flybase::gmod::FFFdb
adaptor     =  

default_name = ${default_location} 
# X:100000-200000
default_range = 1-500000

# examples to show in the introduction
examples =  ${examples}

header = Genome Browser
footer = <hr>
 Adapted from GBrowse of the <a href="http://www.gmod.org/">
 Generic Model Organism Database Project.</a>

debug = 0

plugins = BatchDumper FastaDumper

#fixme ...
default features =  rev_ruler cytoband gene  tRNA noncodingRNA golden_path segment BAC transposable_element pseudogene
primary_feature = gene
scale_feature = cytoband

dumpviews = FastA GenBank GFF
dataviews = "Default" "Collapse All" "Expand All" 

# Web site configuration info
stylesheet  = /templates/gbrowse.css
buttons     = /common/perl/gbrowse_fb/images/buttons
tmpimages   = /tmp/gbrowse_fb

# max and default segment sizes for detailed view
max segment     = 1000001
default segment = 100000

## base range to expand around gene given ID lookup
expand_by = 20000

zoom levels = 100 200 1000 2000 5000 10000 20000 40000 100000 200000 500000 1000000

searchhelp = Search using Chromosome:base_start..end or FlyBase Gene ID.

#panel property
pad_left    = 20
pad_right   = 30
key_style = bottom
key bgcolor = whitesmoke
grid = 1
overview bgcolor = whitesmoke

# # where to link to when user clicks in detailed view

link = sub {
 my $f= shift;
 my $d= $f->primary_id() if $f->can('primary_id');
 $d= $f->source() unless($d);
 if ($d) {
 if ($d =~ /(FB\w\w\d+)/){ return '/cgi-bin/fbidq.html?'.$1; }
 elsif ($d =~ /^C[GR]/ ){ return '/cgi-bin/fbannq.html?acc='.$d; }
 elsif ($d =~ /\-\w+/ ){ return '/cgi-bin/fbannq.html?'.$d; }
 }
 if ($f->name()) { return '/cgi-bin/fbsymq.html?'.$f->name(); }
 return '' ; }

title = sub { my $f= shift;
	return $f->class .":". $f->info . " ". $f->seq_id .':'.$f->start."..".$f->end;
	}

# "automatic" classes to try when an unqualified identifier is given
automatic classes = Symbol Gene Clone

# put reversed features on same track or below ?
mix_strand = 1

# Default glyph settings
glyph       = generic
height      = 5
bgcolor			= palegoldenrod
fgcolor     = cyan
boxcolor 		= blue
label density = 20
bump density  = 50
overview label density = 160

image widths  = 450 640 800 950 1024
default width = 800
width				= 800

#---------- end [GENERAL] ------------------------
  ]]></header>

  <footer  title="footer"><![CDATA[
  ]]></footer> 
  </doc>

<fdef name="GENERIC"><![CDATA[
[GENERIC]
feature  = GENERIC
glyph    = segments
bgcolor  = lightslategray
fgcolor  = black
key      = GENERIC
citation = GENERIC
keygroup = "Analysis features"
link     =
]]></fdef>

<fdef name="rev_ruler"><![CDATA[
[rev_ruler]
glyph = ruler_arrow
label = 0
no_53_label = 1
label_align = center
tick = 1
no_tick_label = 0
units = K
link =
key  = Base ruler
keygroup = "  Genomic features"
]]></fdef>

<!-- ============   Genomic features  =========== -->
<fdef name="cytoband"><![CDATA[
[cytoband]
feature = cytoband
glyph = revcomp_arrow
both = 1
label_align = center
fgcolor = black
bgcolor = black
linewidth = 2
label = 1
label density = 10000
bump = 0
bump density = 0
key  = Cytologic band
keygroup = "  Genomic features"
citation     = Cytological bands on the polytene chromosomes
link = sub { my $f=shift; my $u=$ENV{REQUEST_URI};
	my $r= $f->ref.":".$f->to_FTstring;
	$u =~ s/\?.*$//; $u .= '?name='.$r.";doexpand=1";
	return $u;
	}
]]></fdef>

<fdef name="dna"><![CDATA[
[dna]
feature       = source
glyph         = dnabases
draw_dna      = 1
strand        = forward
label         = 0
label density = 0
key           = DNA sequence
keygroup = "  Genomic features"
link =
]]></fdef>

<fdef name="gene" feature="gene mRNA"><![CDATA[
[gene]
feature       = gene mRNA
glyph         = transcript2
bgcolor       = lightblue
fgcolor       = blue
secondary     = mRNA
secondary_bgcolor = mediumblue
secondary_fgcolor = mediumblue
highlight_color = red
higlighted = 1
label = 1
label density = 50
key           = Gene Model
keygroup = "  Genomic features"
citation     = Gene and mRNA (transcript) features  (annotation DB ; Chado)
link = sub {
 my $f= shift;
 my $d= $f->primary_id() if $f->can('primary_id');
 $d= $f->source() unless($d);
 if ($d) {
 if ($d =~ /(FB\w\w\d+)/){ return '/cgi-bin/fbidq.html?'.$1; }
 elsif ($d =~ /^C[GR]/ ){ return '/cgi-bin/fbannq.html?acc='.$d; }
 }
 if ($f->name()) { return '/cgi-bin/fbgenq.html?symbol='.$f->name(); }
 return '' ; }
]]></fdef>

<fdef name="CDS"><![CDATA[
[CDS]
feature  = CDS
glyph = segments
stranded     = 1
translation  = 1frame
bgcolor = palegoldenrod
fgcolor = goldenrod
label = 1
key  = CDS
citation     = Protein coding sequence (CDS or translation; annotation DB ; Chado)
keygroup = "  Genomic features"
link =
]]></fdef>

<fdef name="pseudogene"><![CDATA[
[pseudogene]
feature  = pseudogene
glyph = segments
strand_arrow  = 1
fgcolor       = orange
label = 1
key           = Pseudogene
keygroup = "  Genomic features...

[truncated message content]

[Gmod-schema-cmts] schema/GMODTools/conf/bulkfiles blastfiles.xml,NONE,1.1 chadofeatconv.xml,NONE,1.

[Gmod-schema-cmts] schema/GMODTools/conf/bulkfiles blastfiles.xml,NONE,1.1 chadofeatconv.xml,NONE,1.1 chadofeatsql.xml,NONE,1.1 dmelhetfeatconv.xml,NONE,1.1 fbbulk-r3h.xml,NONE,1.1 fbbulk-r4.xml,NONE,1.1 gbrowseconf.xml,NONE,1.1 sgdbulk1.xml,NONE,1.1 sgdfeatconf.xml,NONE,1.1 tognomap.xml,NONE,1.1