From: Don G. <don...@us...> - 2004-08-30 04:51:21
|
Update of /cvsroot/gmod/schema/GMODTools/conf/bulkfiles In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv13108/conf/bulkfiles Added Files: blastfiles.xml chadofeatconv.xml chadofeatsql.xml dmelhetfeatconv.xml fbbulk-r3h.xml fbbulk-r4.xml gbrowseconf.xml sgdbulk1.xml sgdfeatconf.xml tognomap.xml Log Message: more details worked out; good for dmel, tested w/ sgd chado dbs --- NEW FILE: dmelhetfeatconv.xml --- <opt name="dmelhetfeatconv" date="20040821" > <title>Chado DB Feature info</title> <about> Use this one for D.melanogaster heterochromatin genome db. These are configurations for converting chado feature table dumps to standard feature/sequence files. Much of below specifies how to process different features (tied to methods in ChadoFeatDump.pm These configs should be data-set independent. This works with, but is independent of SeqUtil2 configs. </about> <informat>feature_table</informat> <!-- feature_table ? --> <outformats>fff</outformats> <outformats>gff</outformats> <outformats>fasta</outformats> <!-- copied from db-release config files .. this is mostly common info, but db-release config can override @featset, %featmap --> <!-- feature sets to make fasta bulk files --> <featset>gene</featset> <featset>mRNA</featset> <featset>CDS</featset> <featset>transcript</featset> <featset>translation</featset> <featset>tRNA</featset> <featset>miscRNA</featset> <featset>transposon</featset> <featset>pseudogene</featset> <featset>gene_extended2000</featset> <featset>five_prime_UTR</featset> <featset>three_prime_UTR</featset> <featset>intron</featset> <featset>intergenic</featset> <featset>scaffold</featset> <!-- featmap for feature sets that need reprocessing, attributes: name = feature set name types = feature types, space delimited (in fff/gff) typelabel = type to use in output header, paired with types list subrange = expansion/extraction range to add to feature location; readseq syntax not yet supported, i.e. 
{start,end}+/-offset) fromdb = extract from chado database feature.residues field rather than chromosome dna file, for curated residues - transcript, translation) get_id = save id, dbxref fields for add to other (genemodel) features add_id = add id of parent feature (genemodel) note: translation type=protein is coded into various seq fetch programs --> <featmap name="gene" get_id="1" /> <featmap name="five_prime_UTR" add_id="gene" /> <featmap name="three_prime_UTR" add_id="gene" /> <featmap name="intron" add_id="gene" /> <featmap name="mRNA" add_id="gene" /> <featmap name="CDS" add_id="gene" /> <featmap name="translation" types="CDS" typelabel="protein" fromdb="1" /> <featmap name="transcript" types="mRNA" typelabel="transcript" fromdb="1" /> <featmap name="scaffold" types="golden_path_region" typelabel="scaffold" fromdb="1" /> <featmap name="transposon" types="transposable_element" typelabel="transposable_element" /> <featmap name="miscRNA" types="ncRNA snRNA snoRNA rRNA" /> <featmap name="gene_extended2000" types="gene" typelabel="gene_ex2000" subrange="-2000..2000" /> <featmap name="intergenic" types="gene" typelabel="intergenic" method="between" /> <rename_child_type>pseudogene|\w+RNA</rename_child_type> <mergematch></mergematch> <!-- remapType: append this name pattern to type --> <name2type_pattern></name2type_pattern> <!-- drop lengthy program.source from match name match_sim4_na_EST_complete_dros RE37642.5prime-2L_wgs2cex-na_EST.complete.dros also fix these ugly names repeating feature types; match:sim4:na_STS.dros BACN11E12-T7-211000022278175-na_STS.dros-sim4 cuttype=1 more cuttype names -211000022278591-aa_SP.real.dros-blastx match:blastx:aa_SPTR.dros ID|AAL57609|SPTR|AAL57609-211000022278591-aa_SPTR.dros-blastx match:blastx:aa_TR.real.dros Q967T2-211000022278591-aa_TR.real.dros-blastx match:blastx:aa_TR.real.dros Q94885-211000022279519-aa_TR.real.dros-blastx match:blastx:aa_SPTR.dros ID|AAL90081|SPTR|AAL90081-linked_7-aa_SPTR.dros-blastx 
match:blastx:aa_SPTR.dros ID|AAL48487|SPTR|AAL48487-AE003846R_extension-aa_SPTR.dros-blast match:groupest:na_DGC.dros RH25653.3prime_revcomp-AABU01000160-na_DGC.dros-groupest perl -pi.old \ -e's,\-AABU0\d+\S+dros\S+,,;' -e's,\_extension\S+,,;' \ -e's,\-linked_\S+,,;' \ -e's,\-2110000\d+\S+,,;' --> <mapname_pattern name="1match" type="^match.*" cuttype="1" from="null" to="null"/> <mapname_pattern name="2match" type="^match.*" from="\-?(2L|2R|3L|3R|4|X)[_\.].*$" to=""/> <mapname_pattern name="3match" type="^match.*" from="\_extension\S+$" to=""/> <mapname_pattern name="4match" type="^match.*" from="\-linked_\S+$" to=""/> <mapname_pattern name="5match" type="^match.*" from="\-2110000\d+\S+$" to=""/> <mapname_pattern name="cex" from="_wgs3_centromere_extension" to="_wgs2cex"/> <mapname_pattern name="dum" from="\-dummy\-" to=""/> <mapname_pattern name="tep" type="transposable_element_pred" from="JOSHTRANSPOSON\-" to=""/> <!-- <mapname_pattern name="psepred" type="^(gene|mRNA)" --> <!-- from="Contig[_\d]+" to=""/> --> <!-- <maptype_pattern name="simwrap" from="sim4:wrap.*" to="sim4:wrap"/> --> <!-- <maptype_pattern name="null" from="null" to="null"/> --> <!-- ## SONG/so Revision: 1.45 ## @is_a@oligo ; SO:0000696 ; SOFA:SOFA ; synonym:oligonucleotide ## 'so' is no longer valid ## old value: @is_a@so ; SO:1000000 ## options are limited: located_sequence_feature, SO:0000110 ?? ## in flybase, 'so' seems used for protein blast matches? ## segment not in this ## alt choices ... 
# @is_a@assembly ; SO:0000353 ; SOFA:SOFA # ** @is_a@golden_path ; SO:0000688 ; SOFA:SOFA << # ** @is_a@supercontig ; SO:0000148 ; SOFA:SOFA ; synonym:scaffold << # @is_a@tiling_path ; SO:0000472 ; SOFA:SOFA # @is_a@virtual_sequence ; SO:0000499 ; SOFA:SOFA # @is_a@chromosome ; SO:0000340 # @part_of@chromosome_arm ; SO:0000105 ## aug04: add new analysis features (HDP,RNAiHDP,fgenesh,) ## these are like exons but parent feature lacks featureloc ## - need to join together by object_oid/parent_oid and compute parent feature (has name) ## SO type.subtype should be match.program ## SONG: match, match_part match_set nucleotide_match cross_genome_match cDNA_match EST_match #? use '.' instead of '_' for part type? would that throw gnomap/gbrowse usage? probably --> <!-- flybase chado has these with fmin == 1-origin, others are 0-origin; why?? --> <origin_one chromosome_arm="1" chromosome_band="1" chromosome="1" /> <topsort chromosome_arm="1" chromosome="1" /> <segmentfeats BAC="1" chromosome_arm="1" chromosome_band="1" chromosome="1" golden_path_region="1" golden_path="1" segment="1" source="1" /> <!-- ## segment no longer valid SO; supercontig or golden_path are best --> <!-- simplefeat == segmentfeats + others --> <simplefeat BAC="1" chromosome_arm="1" chromosome_band="1" chromosome="1" gene="1" golden_path_region="1" golden_path="1" mature_peptide="1" oligonucleotide="1" point_mutation="1" pseudogene="1" region="1" repeat_region="1" segment="1" source="1" transcription_start_site="1" /> <dropname mRNA_genscan="1" mRNA_piecegenie="1" transcription_start_site="1" tRNA_trnascan="1" match_repeat_runner="1" match_repeat_runner_seg="1" /> <dropid cDNA_clone="1" chromosome_band="1" EST="1" exon="1" intron="1" oligonucleotide="1" processed_transcript="1" repeat_region="1" transcription_start_site="1" transposable_element_pred="1" match_repeat_runner="1" match_repeat_runner_seg="1" /> <dropfeat_fff CDS_exon="1" CDS="1" exon="1" five_prime_UTR="1" intron="1" remark="1" 
three_prime_UTR="1" match_repeat_runner="1" /> <dropfeat_gff CDS_exon="1" remark="1" /> <!-- skipaskid & segmentfeats ignored parent_oid ; dont try to make dubious, maybe huge compound feature --> <skipaskid point_mutation="1" region="1" repeat_region="1" transcription_start_site="1" match_part_repeat_runner_seg="1" match_part_repeat_runner="1" match_part_promoter="1" /> <hasdups three_prime_UTR="1" intron="1" five_prime_UTR="1" exon="1" match_repeat_runner_seg="1" match_repeat_runner="1" /> <!-- hasdups match_repeat_runner="1" ? drop match_repeat_runner for match_repeat_runner_seg match_repeat_runner="skip" --> <maptype match_part_repeat_runner="skip" match_part_repeat_runner_seg="skip" match_part_promoter="skip" protein="CDS" CDS="CDS_exon" five_prime_untranslated_region="five_prime_UTR" golden_path_region="golden_path" match_fgenesh="match_fgenesh" match_HDP="match_HDP" match_RNAiHDP="match_RNAiHDP" mRNA_genscan="mRNA_genscan" mRNA_piecegenie="mRNA_piecegenie" mRNA_trnascan="tRNA_trnascan" oligonucleotide="oligo" three_prime_untranslated_region="three_prime_UTR" transposable_element_pred="transposable_element_pred" /> <!-- so => "located_sequence_feature", ## leave in for now; no replacement for so ; SO:1000000 --> <maptype_gff> <!-- FIXME: ordered arrays here .. 
$type, $gffsource --> <!-- change to hash of hash : { fulltype => { gfftype => val, gffsource => val } } --> <match_part_fgenesh>match_part</match_part_fgenesh> <match_part_fgenesh>fgenesh</match_part_fgenesh> <match_part_RNAiHDP>match_part</match_part_RNAiHDP> <match_part_RNAiHDP>RNAiHDP</match_part_RNAiHDP> <mRNA_piecegenie>mRNA</mRNA_piecegenie> <mRNA_piecegenie>piecegenie</mRNA_piecegenie> <match_part_HDP>match_part</match_part_HDP> <match_part_HDP>HDP</match_part_HDP> <match_HDP>match</match_HDP> <match_HDP>HDP</match_HDP> <transposable_element_pred>transposable_element</transposable_element_pred> <transposable_element_pred>predicted</transposable_element_pred> <tRNA_trnascan>tRNA</tRNA_trnascan> <tRNA_trnascan>trnascan</tRNA_trnascan> <match_fgenesh>match</match_fgenesh> <match_fgenesh>fgenesh</match_fgenesh> <mRNA_genscan>mRNA</mRNA_genscan> <mRNA_genscan>genscan</mRNA_genscan> <match_RNAiHDP>match</match_RNAiHDP> <match_RNAiHDP>RNAiHDP</match_RNAiHDP> </maptype_gff> </opt> --- NEW FILE: tognomap.xml --- <opt name="tognomap" date="20040821" > <title>Genome Feature merge (genomic and cytology features)</title> <about> These configs help merge two or more fff feature sets for use with gnomap,gbrowse(fb) and such feature file uses Add these to per-release-db configs. 
Cut from mergeflyfeats4.pl and .xml config, d.gilbert, aug04 </about> <informat>fff</informat> <informat>dna</informat> <seqids path="tmp/chado-fb.ids" date="20040721" noIDmap="cytowalk|protein|mRNA|CDS|EST|cDNA|oligo|processed|repeat|sim4" indexidtype="^(gene|pseudogene|\w+RNA)" indexidpattern="[A-Z]{2}gn\d+" /> <seqfeat path="fff/[\w\-\_]+.fff" date="20040804" seqfeat="1" name="Annotation feature parts" > <drop>chromosome_band</drop> <drop>remark</drop> <drop>source</drop> </seqfeat> <!-- cytofeat path: the name part after the hyphen is [\w\-\_]+ (one or more characters), the same form used by the fff path above. NOTE(review): assumes path is applied as a regular expression by the reader; confirm there. --> <cytofeat path="cytomap/(cyto-features|cytofeat)-[\w\-\_]+.tsv" name="cyto-feature parts" cytofeat="1" date="20040707" /> <sorsa path="cytomap/sorsa.txt" name="Table of genome:cytology mapping" date="20020817" /> </opt> --- NEW FILE: fbbulk-r4.xml --- <opt name="fbbulk-r4" relid="4" ROOT="${GMOD_ROOT}/" TMP="${GMOD_ROOT}/tmp" datadir="genomes/Drosophila_melanogaster" > <!-- datadir="data2/fban" --> <title>FlyBase Chado DB r4.0 test</title> <about> Configurations to extract feature and sequence data for bulk files from FlyBase chado databases, including various release information. </about> <doc name="README"> D. melanogaster euchromatin genome data from FlyBase Release 4.0t. 
See http://flybase.net/annot/dmel_4.txt </doc> <release id="4" rel="r4.0t" dbname="dmel_chado" relfull="dmel_r4_0t_20040821" date="20040821" release_url="/annot/release4.html" /> <release id="3" rel="r3.2.1" relfull="dmel_r3.2.1_07212004" dbname="chado_r3_2_27" date="20040804" release_url="/annot/release3.2.1.html" /> <release id="2" rel="r3.2.0" relfull="dmel_r3.2.0_03162004" dbname="chado_r3.2_18" date="20040321" release_url="/annot/release3.2.html" /> <release id="1" rel="r3.1.0" relfull="dmel_r3.1.0_12182003" dbname="chado_r3.1" date="20031218" release_url="/annot/release3.1.html" /> <release id="h3" rel="r3_2h" dbname="chadohet_081604" relfull="dmel_hetr3_2_08162004" date="20040821" release_url="/annot/het-release3.2.html" /> <release id="p1" rel="dpse_r1.0" relfull="dpse_r1_0_20040821" dbname="dmelr3_2_dpser1_0" date="20040821" release_url="/annot/dpse-release1.html" /> <!-- db.name is release-dependent ; use above --> <db driver="Pg" name="dmel_chado" host="bugbane.bio.indiana.edu" port="7302" user="flybase" password="" /> <org>dmel</org> <species>Drosophila melanogaster</species> <!-- get chromosomes from featdump chromosomes --> <chromosomes>X</chromosomes> <chromosomes>2L</chromosomes> <chromosomes>2R</chromosomes> <chromosomes>3L</chromosomes> <chromosomes>3R</chromosomes> <chromosomes>4</chromosomes> <!-- dnadump FIXME; need to extract scaffold dna from chado db --> <dnadump path="dna/[\w\-\_]+.raw" sql="select feature_id, residues from feature where uniquename = ?" /> <featdump path="tmp/featdump/\w+.tsv" config="chadofeatsql" tag="feature_sql" type="feature_table" splitname="chadofeat" > <!-- <target>fbids</target> // accessory table ; fixme --> <target>chromosomes</target> <target>features</target> <target>matches</target> <target>analysis</target> <!-- ? use this for sql arguments ? 
--> <!-- <target name="chromosomes" arg1="chromosome,chromosome_arm"/> --> </featdump> <!-- <dnafiles --> <!-- path="dna/[\w\-\_]+.raw" --> <!-- name="Chromosome dna" --> <!-- /> --> <fileset name="dna" path="dna/[\w\-\_]+.raw" sql="select feature_id, residues from feature where uniquename = ?" title="Chromosome dna" /> <!-- <featfiles --> <!-- path="fff/[\w\-\_]+.fff" --> <!-- name="Genome features" --> <!-- config="chadofeatconv" --> <!-- /> --> <fileset name="fff" path="fff/[\w\-\_]+.fff" title="FFF Genome features" config="chadofeatconv" handler="FeatureWriter" dogzip="1" /> <fileset name="gff" path="gff/[\w\-\_]+.gff" title="GFF Genome features" dogzip="1" /> <fileset name="gnomap" path="gnomap/[\w\-\_]+.tsv" config="tognomap" handler="GnomapWriter" title="GnoMap features" indexonly="0" /> <fileset name="gbrowse" path="gnomap/gbrowse.conf" config="gbrowseconf" handler="GnomapWriter" withvars="1" /> <!-- <fastafiles --> <!-- path="fasta/[\w\-\_]+.fasta" --> <!-- name="Genome feature sequence fasta" --> <!-- dropnotes="synonym_2nd,synonym" --> <!-- dogzip="1" --> <!-- /> --> <!-- <blastfiles --> <!-- path="blast/[\w\-\_]+.*" --> <!-- name="Blast indices" --> <!-- config="blastfiles" --> <!-- /> --> <fileset name="fasta" path="fasta/[\w\-\_]+.fasta" config="tofasta" handler="FastaWriter" dropnotes="synonym_2nd,synonym" makeall="1" perchr="1" dogzip="1" title="Genome feature sequence fasta" /> <fileset name="blast" path="blast/[\w\-\_]+.*" config="blastfiles" handler="BlastWriter" title="Blast indices" /> <!-- use to add parent ids to features FBgn|FBti ; NOT these - FBan|CG|CR --> <idpattern>(FBgn|FBti)\d+</idpattern> <!-- FIXME new parse steps put all individual feats to fasta ; change back to use featset info --> <!-- feature sets to make fasta bulk files --> <!-- ?? do we want mRNA = computed AND transcript = curated dna ? 
--> <featset>gene</featset> <featset>mRNA</featset> <featset>CDS</featset> <featset>transcript</featset> <featset>translation</featset> <featset>tRNA</featset> <featset>miscRNA</featset> <featset>transposon</featset> <featset>pseudogene</featset> <featset>gene_extended2000</featset> <featset>five_prime_UTR</featset> <featset>three_prime_UTR</featset> <featset>intron</featset> <featset>intergenic</featset> <featset>scaffold</featset> <!-- featmap moved to common bulkfiles/chadofeatconv but can override here if desired to add/replace/delete --> <featmap name="translation" types="CDS" typelabel="protein" fromdb="1" /> <featmap name="miscRNA" types="ncRNA snRNA snoRNA rRNA" /> <featmap name="gene_extended2000" types="gene" typelabel="gene_ex2000" subrange="-2000..2000" /> <featmap name="intergenic" types="gene" typelabel="intergenic" method="between" /> </opt> --- NEW FILE: chadofeatsql.xml --- <opt name="chadofeatsql" date="20040821" > <title>FlyBase Chado DB SQL</title> <about> This is a collection of Chado DB SQL calls to extract all basic genome features, into intermediate feature_table form arm fmin fmax strand type name id oid attr_type attribute 2L 0 305900 1 golden_path_region AE003590 AE003590 1273141 dbxref Gadfly:AE003590 2L 6364 6366 1 transcription_start_site 6365-6366-AE999999.Fake-dummy-promoter 6365-6366-AE999999.Fake-dummy-promoter 1273564 2L 6773 6808 1 exon NULL:960558 1273721 parent_oid 1273720:1 2L 6773 9276 mRNA 6773,9276-AE999999.Fake-dummy-piecegenie NULL:9605 There is a tag feature_sql, type="feature_table" for each group of features needing separate SQL: chromosome/super-contigs, gene_models, matches, analyses, syntenic features .. Logic in Bulkfiles::FeatureWriter then merges, selects/reformats these feature_tables and writes bulk files per chromosome. One can also attach post-processing scripts (see matches below). 
</about> <feature_sql name="fbids" type="list" output="chado-fb.ids"> <sql> -- used for flybase to exclude cytologic features with same genome feature id SELECT accession FROM dbxref WHERE accession like '%FBgn%' or accession like '%FBti%'; </sql> </feature_sql> <feature_sql name="chromosomes" type="feature_table" output="chromosomes.tsv"> <sql> -- get chromosome/arm/supercontig/... lengths/names -- add get-residues to file SELECT CASE WHEN armft.name IS NULL THEN armft.uniquename ELSE armft.name END AS arm, '1' as fmin, armft.seqlen as fmax, '0' as strand, featcv.name as type, armft.name as name, armft.uniquename as id, armft.feature_id as oid, 'species' as attr_type, org.genus || '_' || org.species AS attribute FROM feature armft, organism org, cvterm featcv WHERE featcv.name in ( 'chromosome', 'chromosome_arm') -- need more choices and armft.organism_id = org.organism_id and armft.type_id = featcv.cvterm_id ORDER BY arm ; </sql> </feature_sql> <feature_sql name="features" type="feature_table" output="features.tsv"> <sql> -- standard features (excluding matches, analyses) SELECT distinct CASE WHEN armft.name IS NULL THEN armft.uniquename ELSE armft.name END AS arm, armloc.fmin, armloc.fmax, armloc.strand, targcv.name as type, targft.name as name, targft.uniquename as id, targft.feature_id as oid, attr.type as attr_type, attr.attribute FROM feature armft, feature targft left outer join gffattr_gmodel attr on (targft.feature_id = attr.feature_id), featureloc armloc, cvterm targcv WHERE NOT (targcv.name in ('match')) -- add orthologous_region, syntenic_region and targft.type_id = targcv.cvterm_id and armft.feature_id = armloc.srcfeature_id and targft.feature_id = armloc.feature_id ORDER BY arm, armloc.fmin, targcv.name ; </sql> </feature_sql> <feature_sql name="matches" type="feature_table" output="matches.tsv"> <sql> -- use this to get the paired genome/target match items w/ attr (or null) -- -- jun04 - need to filter out apollo dupl. 
evidence for -- match features, type = transposable_element_insertion_site SELECT distinct CASE WHEN armft.name IS NULL THEN armft.uniquename ELSE armft.name END AS arm, armloc.fmin, armloc.fmax, armloc.strand, targcv.name as type, targft.name as name, targft.uniquename as id, targft.feature_id as oid, attr.type as attr_type, attr.attribute FROM feature matchft, feature armft, feature targft left outer join gffattr_match attr on (targft.feature_id = attr.feature_id), featureloc armloc, featureloc targloc, cvterm targcv, cvterm matchcv, cvterm armcv -- tested speed of this w/ cv.names versus cvterm_id number inserted -- pg is smart enough to optimize cv name lookup once - leave as is WHERE matchcv.name = 'match' and matchcv.cvterm_id = matchft.type_id and armcv.name in ( 'chromosome', 'chromosome_arm') and armcv.cvterm_id = armft.type_id and targft.type_id = targcv.cvterm_id and targft.feature_id != armft.feature_id -- separate paired features here - keep chr-arm loc, target types/attr and matchft.feature_id = armloc.feature_id and armft.feature_id = armloc.srcfeature_id and matchft.feature_id = targloc.feature_id and targft.feature_id = targloc.srcfeature_id ORDER BY arm, armloc.fmin, targcv.name ; </sql> </feature_sql> <feature_sql name="analysis" type="feature_table" output="analysis.tsv"> <!-- ant-like ; is this usable? 
--> <target name="main" depends="query,postprocess"/> <target name="query" action="sql" /> <target name="postprocess" action="rdump" description="fix parents w/o featureloc from kid values" /> <!-- this works: perl -i.old rdump $r/tmp/featdump/analysis.tsv -->
<script name="rdump" type="postprocess" shell="perl -i" language="perl"> <![CDATA[
# rdump: rewrite a tab-separated analysis feature table in place (perl -i).
#
# Consecutive rows that share the same last column are handled as one group.
# The first row of a group is the parent feature, the remaining rows are its
# child parts.  For every group:
#   - columns 0-3 of the parent (arm, fmin, fmax, strand) are filled in from
#     the children: arm and strand from the first child, fmin/fmax as the
#     smallest/largest child values;
#   - the last column of the parent is removed and the column before it is
#     replaced by the line terminator;
#   - a leading "match_" in column 4 of each child becomes "match_part_".
# The header row (first column "arm") is copied through unchanged.
use strict;
use warnings;

my @group;        # rows (array refs of columns) of the group being collected
my $group_key;    # last-column value shared by the rows in @group

while (my $line = <>) {
    my @cols = split /\t/, $line;

    # Header row: pass through untouched, it belongs to no group.
    if (defined $cols[0] && $cols[0] eq "arm") {
        print join("\t", @cols);
        next;
    }

    # Compare keys without the line terminator so that a final line lacking
    # a newline still joins the group it belongs to.
    my $key = defined $cols[-1] ? $cols[-1] : "";
    $key =~ s/[\r\n]+\z//;

    flush_group() if (defined $group_key && $key ne $group_key);
    push @group, \@cols;
    $group_key = $key;

    # eof without parentheses is true at the end of each input file, so a
    # group never spans two files and no stale key leaks into the next file.
    if (eof) {
        flush_group();
        undef $group_key;
    }
}

# True when the value is a plain integer coordinate.
sub is_coord {
    my ($value) = @_;
    return defined $value && $value =~ /\A-?\d+\z/;
}

# Print the collected group with its parent row repaired, then empty it.
sub flush_group {
    return unless @group;    # nothing collected yet

    my ($parent, @children) = @group;

    if (@children) {
        my ($arm, $strand) = @{ $children[0] }[0, 3];
        my ($fmin, $fmax);
        for my $child (@children) {
            my ($cmin, $cmax) = @{$child}[1, 2];
            # Children without a usable coordinate do not take part in the span.
            $fmin = $cmin if is_coord($cmin) && (!defined $fmin || $cmin < $fmin);
            $fmax = $cmax if is_coord($cmax) && (!defined $fmax || $cmax > $fmax);
        }
        @{$parent}[0 .. 3] =
            map { defined $_ ? $_ : "" } ($arm, $fmin, $fmax, $strand);

        for my $child (@children) {
            $child->[4] =~ s/^match_/match_part_/ if defined $child->[4];
        }
    }
    # A parent without children keeps whatever location it already carries.

    # Remove the last column and end the row where the one before it was.
    if (@$parent > 1) {
        pop @$parent;
        $parent->[-1] = "\n";
    }

    print join("\t", @$_) for @group;
    @group = ();
    return;
}
]]> </script>
<!-- want sql tag at top level of feature_sql like others --> <sql> -- select all an features of right type, with arm featureloc when avail -- add analysis.sourcename where needed (ignore 'dummy'); -- change 'match_' leading type ; drop or make option SELECT armft.arm, armft.fmin, armft.fmax, armft.strand, -- 'match_' || an.program as type, 'match_' || an.program || an.sourcename as type, targft.name as name, targft.uniquename as id, targft.feature_id as oid, CASE WHEN attr.type IS NULL THEN text('object_oid') ELSE attr.type END as attr_type, CASE WHEN attr.attribute IS NULL THEN text(targft.feature_id) ELSE attr.attribute END as attribute FROM feature targft left outer join gffatts_evid attr on (targft.feature_id = attr.feature_id), analysisfeature anf left outer join gffatts_anfloc armft on (anf.feature_id = armft.feature_id), analysis an WHERE an.program in ('HDP','RNAiHDP','fgenesh') -- FIXME add params here and an.analysis_id = anf.analysis_id and anf.feature_id = targft.feature_id ; </sql> </feature_sql> <feature_sql name="views" type="view"> <sql> -- from sequence-gff-views.sql -- dont need all of 
these attribs; -- dbxref: yes, cvterm: no, synonym: no?, pub: no, -- featureprop - want some: cyto_range, gbunit?(no) -- add dbxref_2nd, aug04 for dbxref need to know primary/secondary > feature_dbxref.is_current ! -- attr view for regular features which may have parent features (exons) CREATE OR REPLACE VIEW gffattr_gmodel ( feature_id, type, attribute ) AS SELECT feature_id, CASE WHEN fs.is_current IS FALSE THEN 'dbxref_2nd' ELSE 'dbxref' END AS type, d.name || ':' || s.accession AS attribute FROM dbxref s, feature_dbxref fs, db d WHERE fs.dbxref_id = s.dbxref_id and s.db_id = d.db_id UNION ALL SELECT feature_id, cv.name AS type, fp.value AS attribute FROM featureprop fp, cvterm cv WHERE fp.type_id = cv.cvterm_id and (cv.name = 'cyto_range' or cv.name = 'gbunit') -- keep this restriction - other props not useful here: comments, sp_comment, owner ... UNION ALL SELECT feature_id, CASE WHEN fs.is_current IS FALSE THEN 'synonym_2nd' ELSE 'synonym' END AS type, s.synonym_sgml AS attribute FROM feature_synonym fs, synonym s WHERE fs.synonym_id = s.synonym_id and fs.is_internal IS FALSE UNION ALL -- add parent feat ids for exons, etc. 
SELECT pk.subject_id AS feature_id, 'parent_oid' AS type, CASE WHEN pk.rank IS NULL THEN text(pk.object_id) ELSE pk.object_id || ':' || pk.rank END FROM feature_relationship pk ; GRANT SELECT ON gffattr_gmodel TO PUBLIC; -- attr view for match features CREATE OR REPLACE VIEW gffattr_match ( feature_id, type, attribute ) AS SELECT feature_id, CASE WHEN fs.is_current IS FALSE THEN 'dbxref_2nd' ELSE 'dbxref' END AS type, d.name || ':' || s.accession AS attribute FROM dbxref s, feature_dbxref fs, db d WHERE fs.dbxref_id = s.dbxref_id and s.db_id = d.db_id UNION ALL SELECT feature_id, cv.name AS type, fp.value AS attribute FROM featureprop fp, cvterm cv WHERE fp.type_id = cv.cvterm_id and (cv.name = 'cyto_range' or cv.name = 'gbunit') UNION ALL SELECT feature_id, CASE WHEN fs.is_current IS FALSE THEN 'synonym_2nd' ELSE 'synonym' END AS type, s.synonym_sgml AS attribute FROM feature_synonym fs, synonym s WHERE fs.synonym_id = s.synonym_id and fs.is_internal IS FALSE ; GRANT SELECT ON gffattr_match TO PUBLIC; -- attrib view for cross-species feats (synteny, orthology) CREATE OR REPLACE VIEW gffattr_synteny ( feature_id, type, attribute ) AS -- parent feat ids for source supercontigs, etc. 
SELECT pk.subject_id, 'parent_oid' as type, CASE WHEN pk.rank IS NULL THEN text(pk.object_id) ELSE pk.object_id || ':' || pk.rank END FROM feature_relationship pk ; GRANT SELECT ON gffattr_synteny TO PUBLIC; -- for analysis features CREATE OR REPLACE VIEW gffatts_anfloc ( feature_id, arm, fmin, fmax, strand ) AS SELECT armloc.feature_id, CASE WHEN armft.name IS NULL THEN armft.uniquename ELSE armft.name END AS arm, armloc.fmin, armloc.fmax, armloc.strand FROM feature armft, featureloc armloc, cvterm armcv WHERE armft.type_id = armcv.cvterm_id and armcv.name in ( 'chromosome', 'chromosome_arm') and armft.feature_id = armloc.srcfeature_id ; GRANT SELECT ON gffatts_anfloc TO PUBLIC; -- for analysis features CREATE OR REPLACE VIEW gffatts_evid ( feature_id, type, attribute ) AS SELECT pk.subject_id, text('parent_oid'), text(pk.object_id) FROM feature_relationship pk ; GRANT SELECT ON gffatts_evid TO PUBLIC; </sql> </feature_sql> </opt> --- NEW FILE: chadofeatconv.xml --- <opt name="chadofeatconv" date="20040821" > <title>Chado DB Feature info</title> <about> These are configurations for converting chado feature table dumps to standard feature/sequence files. Much of below specifies how to process different features (tied to methods in ChadoFeatDump.pm These configs should be data-set independent. This works with, but is independent of SeqUtil2 configs. </about> <informat>feature_table</informat> <outformats>fff</outformats> <outformats>gff</outformats> <outformats>fasta</outformats> <!-- copied from db-release config files .. 
this is mostly common info, but db-release config can override @featset, %featmap --> <!-- feature sets to make fasta bulk files --> <featset>gene</featset> <featset>mRNA</featset> <featset>CDS</featset> <featset>transcript</featset> <featset>translation</featset> <featset>tRNA</featset> <featset>miscRNA</featset> <featset>transposon</featset> <featset>pseudogene</featset> <featset>gene_extended2000</featset> <featset>five_prime_UTR</featset> <featset>three_prime_UTR</featset> <featset>intron</featset> <featset>intergenic</featset> <featset>scaffold</featset> <!-- featmap for feature sets that need reprocessing, attributes: name = feature set name types = feature types, space delimited (in fff/gff) typelabel = type to use in output header, paired with types list subrange = expansion/extraction range to add to feature location; readseq syntax not yet supported, i.e. {start,end}+/-offset) fromdb = extract from chado database feature.residues field rather than chromosome dna file, for curated residues - transcript, translation) get_id = save id, dbxref fields for add to other (genemodel) features add_id = add id of parent feature (genemodel) note: translation type=protein is coded into various seq fetch programs --> <featmap name="gene" get_id="1" /> <featmap name="five_prime_UTR" add_id="gene" /> <featmap name="three_prime_UTR" add_id="gene" /> <featmap name="intron" add_id="gene" /> <featmap name="mRNA" add_id="gene" /> <featmap name="CDS" add_id="gene" /> <featmap name="translation" types="CDS" typelabel="protein" fromdb="1" /> <featmap name="transcript" types="mRNA" typelabel="transcript" fromdb="1" /> <featmap name="scaffold" types="golden_path_region" typelabel="scaffold" fromdb="1" /> <featmap name="transposon" types="transposable_element" typelabel="transposable_element" /> <featmap name="miscRNA" types="ncRNA snRNA snoRNA rRNA" /> <featmap name="gene_extended2000" types="gene" typelabel="gene_ex2000" subrange="-2000..2000" /> <featmap name="intergenic" 
types="gene" typelabel="intergenic" method="between" /> <rename_child_type>pseudogene|\w+RNA</rename_child_type> <mergematch></mergematch> <!-- ## SONG/so Revision: 1.45 ## @is_a@oligo ; SO:0000696 ; SOFA:SOFA ; synonym:oligonucleotide ## 'so' is no longer valid ## old value: @is_a@so ; SO:1000000 ## options are limited: located_sequence_feature, SO:0000110 ?? ## in flybase, 'so' seems used for protein blast matches? ## segment not in this ## alt choices ... # @is_a@assembly ; SO:0000353 ; SOFA:SOFA # ** @is_a@golden_path ; SO:0000688 ; SOFA:SOFA << # ** @is_a@supercontig ; SO:0000148 ; SOFA:SOFA ; synonym:scaffold << # @is_a@tiling_path ; SO:0000472 ; SOFA:SOFA # @is_a@virtual_sequence ; SO:0000499 ; SOFA:SOFA # @is_a@chromosome ; SO:0000340 # @part_of@chromosome_arm ; SO:0000105 ## aug04: add new analysis features (HDP,RNAiHDP,fgenesh,) ## these are like exons but parent feature lacks featureloc ## - need to join together by object_oid/parent_oid and compute parent feature (has name) ## SO type.subtype should be match.program ## SONG: match, match_part match_set nucleotide_match cross_genome_match cDNA_match EST_match #? use '.' instead of '_' for part type? would that throw gnomap/gbrowse usage? probably --> <!-- remapType: append this name pattern to type --> <name2type_pattern>[-_](genscan|piecegenie|twinscan|genewise|pred|trnascan)</name2type_pattern> <!-- flybase chado has these with fmin == 1-origin, others are 0-origin; why?? 
--> <origin_one chromosome_arm="1" chromosome_band="1" chromosome="1" /> <topsort chromosome_arm="1" chromosome="1" /> <segmentfeats BAC="1" chromosome_arm="1" chromosome_band="1" chromosome="1" golden_path_region="1" golden_path="1" segment="1" source="1" /> <!-- ## segment no longer valid SO; supercontig or golden_path are best --> <!-- simplefeat == segmentfeats + others --> <simplefeat BAC="1" chromosome_arm="1" chromosome_band="1" chromosome="1" gene="1" golden_path_region="1" golden_path="1" mature_peptide="1" oligonucleotide="1" point_mutation="1" pseudogene="1" region="1" repeat_region="1" segment="1" source="1" transcription_start_site="1" /> <dropname mRNA_genscan="1" mRNA_piecegenie="1" transcription_start_site="1" tRNA_trnascan="1" /> <dropid cDNA_clone="1" chromosome_band="1" EST="1" exon="1" intron="1" oligonucleotide="1" processed_transcript="1" repeat_region="1" transcription_start_site="1" transposable_element_pred="1" /> <dropfeat_fff CDS_exon="1" CDS="1" exon="1" five_prime_UTR="1" intron="1" remark="1" three_prime_UTR="1" /> <dropfeat_gff CDS_exon="1" remark="1" /> <skipaskid point_mutation="1" region="1" repeat_region="1" transcription_start_site="1" /> <hasdups three_prime_UTR="1" intron="1" five_prime_UTR="1" exon="1" /> <maptype protein="CDS" CDS="CDS_exon" five_prime_untranslated_region="five_prime_UTR" golden_path_region="golden_path" match_fgenesh="match_fgenesh" match_HDP="match_HDP" match_RNAiHDP="match_RNAiHDP" mRNA_genscan="mRNA_genscan" mRNA_piecegenie="mRNA_piecegenie" mRNA_trnascan="tRNA_trnascan" oligonucleotide="oligo" three_prime_untranslated_region="three_prime_UTR" transposable_element_pred="transposable_element_pred" /> <!-- so => "located_sequence_feature", ## leave in for now; no replacement for so ; SO:1000000 --> <mapname_pattern name="null" from="null" to="null"/> <mapname_pattern name="dum" from="\-dummy\-" to=""/> <mapname_pattern name="tep" type="transposable_element_pred" from="JOSHTRANSPOSON\-" to=""/> <!-- 
<maptype_pattern name="null" from="null" to="null"/> --> <!-- <maptype_pattern name="simwrap" from="sim4:wrap.*" to="sim4:wrap"/> --> <maptype_gff> <!-- FIXME: ordered arrays here .. $type, $gffsource --> <!-- change to hash of hash : { fulltype => { gfftype => val, gffsource => val } } --> <match_part_fgenesh>match_part</match_part_fgenesh> <match_part_fgenesh>fgenesh</match_part_fgenesh> <match_part_RNAiHDP>match_part</match_part_RNAiHDP> <match_part_RNAiHDP>RNAiHDP</match_part_RNAiHDP> <mRNA_piecegenie>mRNA</mRNA_piecegenie> <mRNA_piecegenie>piecegenie</mRNA_piecegenie> <match_part_HDP>match_part</match_part_HDP> <match_part_HDP>HDP</match_part_HDP> <match_HDP>match</match_HDP> <match_HDP>HDP</match_HDP> <transposable_element_pred>transposable_element</transposable_element_pred> <transposable_element_pred>predicted</transposable_element_pred> <tRNA_trnascan>tRNA</tRNA_trnascan> <tRNA_trnascan>trnascan</tRNA_trnascan> <match_fgenesh>match</match_fgenesh> <match_fgenesh>fgenesh</match_fgenesh> <mRNA_genscan>mRNA</mRNA_genscan> <mRNA_genscan>genscan</mRNA_genscan> <match_RNAiHDP>match</match_RNAiHDP> <match_RNAiHDP>RNAiHDP</match_RNAiHDP> </maptype_gff> </opt> --- NEW FILE: blastfiles.xml --- <opt name="blastfiles" date="20040821" blasthome="${ARGOS_ROOT}/common/servers/blast/Bin" formatdb="${ARGOS_ROOT}/common/servers/blast/Bin/formatdb" formatdbopts="-o F " isprot_patt="(translation|aa_)" > <title>Blast index writer</title> <about> These are configurations for updating blast indices, rc files, html pages, given genome fasta input files and fasta/feature groups. 
</about> <informat>fasta</informat> <outformats>ncbi</outformats> <outformats> </outformats> <!-- feature sets to make blast indices --> <blastset>gene</blastset> <blastset>transcript</blastset> <blastset>translation</blastset> <blastset>tRNA</blastset> <blastset>miscRNA</blastset> <!-- <blastset>transposon</blastset> --> <blastset>pseudogene</blastset> <blastset>intergenic</blastset> <blastset>scaffold</blastset> <blastset>chromosome</blastset> <blastmap name="miscRNA" types="ncRNA snRNA snoRNA rRNA" /> <doc name="dbrc" path="blast/blast.rc"><![CDATA[ # blast.rc # This is dynamic configuration file for NCBI WWW BLAST service # # Number of CPUs to use for a single request # NumCpuToUse 1 # # Here are list of combinations program/database, # that allowed by BLAST service. Format: <program> <db> <db> ... # blastn blastp blastx tblastn tblastx ]]></doc> <!-- also need to write some .nal, .pal ncbi blast files to join other dbs .. put tag into individual ones to join, eg. nalfile="na_all na_est" --> <doc name="dbselect" path="blast/blastdb.in"> <!-- database <select> include file html; no content tag here --> <!-- change this to be entire blast html form instead of using shtml #include ? --> <header title=""><![CDATA[ ]]></header> <footer title=""><![CDATA[ ]]></footer> </doc> <doc name="dbtable" path="blast/blast_databases.html"> <!-- no content tag here --> <!-- optional <tableheader> four fields are dbname, dbfile, update-date, description --> <header title=""><![CDATA[<HTML> <HEAD> <TITLE>${species} BLAST Databases</TITLE> </HEAD> <BODY> <h1> <I>${species}</I> Genome BLAST </h1> <h2> Data release <a href="${release_url}">${rel}</a> <br>Data sets for BLAST search </h2> <HR> <B><LI>Database</B> - Choose a database     <FONT COLOR="red"><B>required</B></FONT> <BR><BR> <TABLE BORDER=1 cellpadding="5" cellspacing="1"> ]]></header> <footer title=""><![CDATA[ <TR> <TD colspan=4 align=left> <a name=key> <B>key: AA = protein, NT = nucleotide. 
To download databases, visit the <A HREF="/annot/" target=_self>Sequence Download Page</A></B> </TD> </TR> </TABLE> </BODY> </HTML> ]]></footer> </doc> <!-- these db entries for databases.html --> <!-- want sorted in given order; as array not hash --> <!-- FIXME: $org name ... --> <db name="na_all" title=" All ${species} sequences (NT)"><![CDATA[ Euchromatin, heterochromatin, and predicted genes; genomic clones, ESTs, STSs, P-element insertion sites, and public sequences from GenBank (see below) ]]> </db> <!-- these are generic for any organism --> <db name="chromosome" title=" All genome chromosome arms (NT)" nalfile="na_all" ><![CDATA[ Whole genome by chromosome arms ]]> </db> <db name="transcript" title=" Predicted genes (NT)" nalfile="na_all" ><![CDATA[ Complete transcript (CDS + UTR) for all of the genes ]]> </db> <db name="translation" title=" Predicted proteins (AA)"><![CDATA[ Peptide translations for all of the genes ]]> </db> <!-- fixme; need to extract scaffold dna from chado db --> <db name="scaffold" title=" Euchromatin and heterochromatin scaffolds (NT)" nalfile="na_all" ><![CDATA[ Genomic sequence for the euchromatic chromosome arms, and from the heterochromatin assembly, divided into ~350kb GenBank scaffolds ]]> </db> <db name="intergenic" title="Intergenic sequence (NT)" nalfile="na_all" ><![CDATA[ Intergenic regions -- genome sequence between genes ]]> </db> <db name="transposon" skip="1" title="Transposon insertions (NT)"><![CDATA[ Transposon insertion sites in genome ]]> </db> <!-- these are flybase specific --> <db name="na_cDNA" title=" BDGP full-length cDNAs (NT)" nalfile="na_all" org="dmel" ><![CDATA[ <I>${species}</I> full-length cDNAs from the BDGP ]]> </db> <db name="na_geno_clones" title=" BDGP/EDGP genomic clones (NT)" nalfile="na_all" org="dmel" ><![CDATA[ P1, BAC, and cosmid sequences from the BDGP & EDGP that have been submitted to GenBank -- this does not include whole genome shotgun sequence ]]> </db> <!-- <db 
name="dmel_hetero_scaffolds" title=" Heterochromatin scaffolds (NT)"><![CDATA[ --> <!-- Genomic sequence from <a href="http://www.celera.com" target="_self">Celera</a> --> <!-- for the heterochromatin from the WGS3 assembly --> <!-- ]]> --> <!-- </db> --> <db name="dmel_aa_swall" title=" SwissProt and TREMBL proteins (SWALL) (AA)"><![CDATA[ A non-redundant combination of the <i>${species}</i> entries from the SwissProt+TrEMBL+TrEMBLNew databases (SWALL) ]]> </db> <db name="dmel_aa_uniprot" title=" UniProt proteins (AA)"><![CDATA[ <i>${species}</i> proteins from UniProt ]]> </db> <db name="dmel_aa_refseq" title=" RefSeq proteins (AA)"><![CDATA[ <i>${species}</i> proteins from RefSeq ]]> </db> <db name="dmel_na_refseq" title=" RefSeq Sequences (NT)"><![CDATA[ <i>${species}</i> nucleic sequences from RefSeq ]]> </db> <db name="na_gb" title=" GenBank (without BDGP, EDGP, Celera or dbEST) (NT)" nalfile="na_all" ><![CDATA[ <i>${species}</i> sequences collected from public sequence databases (but without BDGP, EDGP, Celera, or dbEST sequences) ]]> </db> <db name="na_est" title="ESTs from BDGP and dbEST cDNAs (NT)" org="dmel" ><![CDATA[ <I>Drosophila</I> Expressed Sequence Tags from the BDGP and from dbEST cDNAs (redundant) ]]> </db> <db name="na_EST" part_of="na_est" title="" skip="1" nalfile="na_all na_est" ><![CDATA[ Expressed Sequence Tags ]]> </db> <db name="na_dbEST" part_of="na_est" title="" skip="1" nalfile="na_all na_est" ><![CDATA[ dbEST cDNAs ]]> </db> <db name="na_pe" title=" P element insertion sites (NT)" nalfile="na_all" org="dmel"><![CDATA[ Genomic sequence flanking BDGP P-element insertions (BDGP) ]]> </db> <db name="na_re" title=" Repeats (NT)" nalfile="na_all" ><![CDATA[ a curated set of <i>${species}</i> known repeats ]]> </db> <db name="na_STS" title=" STSs (NT)" nalfile="na_all"><![CDATA[ <I>Drosophila</I> STSs from the BDGP and EDGP mapping projects ]]> </db> <db name="na_te" title=" Transposons (NT)" nalfile="na_all"><![CDATA[ a curated set of 
<i>${species}</i> transposable elements ]]> </db> </opt> --- NEW FILE: sgdbulk1.xml --- <opt name="sgdbulk1" relid="1" ROOT="${GMOD_ROOT}/" TMP="${GMOD_ROOT}/tmp" datadir="genomes/Saccharomyces_cerevisiae" > <title>SGD Lite rel 1</title> <doc name="README"><![CDATA[ Genome file output for Chado genome databases Database: SGD Lite rel 1 Software: Bio::GMOD::Bulkfiles Chado genome databases available (aug 2004) are ftp://flybase.net/genomes/Drosophila_melanogaster/current/pgsql/chado_r3_2_26_s.gz http://sgdlite.princeton.edu/download/sgdlite/2004_05_19_sgdlite.sql.gz QUICK TEST: # get soft cvs -d :pserver:ano...@cv...:/cvsroot/gmod \ co -d GMODTools schema/GMODTools # load chado db to Postgres createdb sgdlite_20040519 wget http://sgdlite.princeton.edu/download/sgdlite/2004_05_19_sgdlite.sql.gz (zcat *sgdlite.sql.gz | psql -d sgdlite_20040519 -f - ) >& log.load # set root path to here and make bulkfiles env GMOD_ROOT=$PWD ARGOS_ROOT=$PWD \ perl -I./GMODTools/lib/ GMODTools/bin/bulkfiles.pl sgdbulk1 ]]></doc> <doc name="Test-results.txt"><![CDATA[ Test for GMOD::Bulkfile processor aug 04, d.gilbert dghome2% ls $sgr /bio/biodb/flybase/data2/fban/sgdlite_20040519: chadofeat-summary.txt fasta/ gff/ tmp/ dna/ fff/ gnomap/ dghome2% du $sgr 24052 /bio/biodb/flybase/data2/fban/sgdlite_20040519/dna 9928 /bio/biodb/flybase/data2/fban/sgdlite_20040519/fasta 944 /bio/biodb/flybase/data2/fban/sgdlite_20040519/fff 924 /bio/biodb/flybase/data2/fban/sgdlite_20040519/gff 0 /bio/biodb/flybase/data2/fban/sgdlite_20040519/gnomap 3980 /bio/biodb/flybase/data2/fban/sgdlite_20040519/tmp/featdump 3980 /bio/biodb/flybase/data2/fban/sgdlite_20040519/tmp 39836 /bio/biodb/flybase/data2/fban/sgdlite_20040519 # Summary of features for Saccharomyces cerevisiae from SGD Chado DB Lite r1 [20040519] # ALL chromosomes ARS 59 CDS 7050 centromere 16 chromosome 17 gene 13645 long_terminal_repeat 382 ncRNA 14 non_transcribed_region 3 noncoding_exon 466 rRNA 56 region 78 repeat_family 110 retrotransposon 
50 snRNA 14 snoRNA 135 tRNA 347 telomere 32 transcribed_spacer_region 16 transposable_element_gene 100 == process log == DBI->connect( dbi:Pg:dbname=sgdlite_20040519;host=localhost;port=7302 ) do sql views view sql dump chromosomes feature_table /bio/biodb/flybase/data2/fban/sgdlite_20040519/tmp/featdump/ chromosomes.tsv sql dump chromosomes n rows=17 sql dump features feature_table /bio/biodb/flybase/data2/fban/sgdlite_20040519/tmp/featdump/fea tures.tsv sql dump features n rows=22573 sortNSplitByChromosome: dumpChromosomeBases /bio/biodb/flybase/data2/fban/sgdlite_20040519/dna/scer_chrI_dna_sgdr1.raw ... makeFiles: outformats= fff gff fasta openInput: type=feature/table part=0 openInput: name=chrI, type=feature/table, /bio/biodb/flybase/data2/fban/sgdlite_20040519/tmp/fe atdump/chadofeat-chrI.tsv # output /bio/biodb/flybase/data2/fban/sgdlite_20040519/fff/scer_chrI_sgdr1.fff (append=0) # output /bio/biodb/flybase/data2/fban/sgdlite_20040519/gff/scer_chrI_sgdr1.gff (append=0) # output /bio/biodb/flybase/data2/fban/sgdlite_20040519/fasta/scer_chrI_sgdr1.fasta (append=0) putFeats n=11, total=11, oid1=126750 open dnafile /bio/biodb/flybase/data2/fban/sgdlite_20040519/dna/scer_chrI_dna_sgdr1.raw, length =230210 putFeats n=17, total=25, oid1=126761 processChadoTable ndone = 1737 openInput: type=feature/table part=18 openInput: nothing matches part=18 makeFiles: done ]]></doc> <release id="1" rel="sgdr1" dbname="sgdlite_20040519" date="20040519" relfull="sgdlite_20040519" release_url="/annot/sgdlite-release1.html" /> <release id="0" rel="sgdr1" dbname="sgdlite_20040519" date="20040519" relfull="sgdlite_20040519" release_url="/annot/sgdlite-release1.html" /> <!-- need two+ release entries for config reader .. --> <db driver="Pg" name="sgdlite_20040519" host="localhost" port="7302" user="" password="" /> <org>scer</org> <species>Saccharomyces cerevisiae</species> <dnadump path="dna/dna-\w+.raw" sql="select feature_id, residues from feature where uniquename = ?" 
/> <featdump path="tmp/featdump/\w+.tsv" config="chadofeatsql" tag="feature_sql" type="feature_table" splitname="chadofeat" > <target>chromosomes</target> <target>features</target> <!-- //none in sgdlite1// <target>matches</target> --> <!-- //none in sgdlite1// <target>analysis</target> --> <!-- ? use this for sql arguments ? --> <!-- <target name="chromosomes" arg1="chromosome,chromosome_arm"/> --> </featdump> <fileset name="dna" path="dna/[\w\-\_]+.raw" sql="select feature_id, residues from feature where uniquename = ?" title="Chromosome dna" /> <fileset name="fff" path="fff/[\w\-\_]+.fff" title="FFF Genome features" config="sgdfeatconf" handler="FeatureWriter" dogzip="1" /> <fileset name="gff" path="gff/[\w\-\_]+.gff" title="GFF Genome features" dogzip="1" /> <fileset name="gnomap" path="gnomap/[\w\-\_]+.tsv" config="tognomap" handler="GnomapWriter" title="GnoMap features" /> <fileset name="gbrowse" path="gnomap/gbrowse.conf" config="gbrowseconf" handler="GnomapWriter" withvars="1" /> <fileset name="fasta" path="fasta/[\w\-\_]+.fasta" config="tofasta" handler="FastaWriter" dropnotes="synonym_2nd,synonym" makeall="1" perchr="0" dogzip="1" title="Genome feature sequence fasta" /> <fileset name="blast" path="blast/[\w\-\_]+.*" config="blastfiles" handler="BlastWriter" title="Blast indices" /> <idpattern></idpattern> <!-- feature sets to make fasta bulk files --> <featset>chromosome</featset> <featset>gene</featset> <featset>CDS</featset> <!-- <featset>translation</featset> // no translations in sgdlite db --> <featset>tRNA</featset> <featset>miscRNA</featset> <featset>transposon</featset> <featset>gene_extended2000</featset> <featset>intergenic</featset> <!-- featmap moved to common bulkfiles/chadofeatconv but can override here if desired to add/replace/delete --> <!-- sgd variant for this feature --> <featmap name="transposon" types="transposable_element_gene" typelabel="transposable_element" /> </opt> --- NEW FILE: fbbulk-r3h.xml --- <opt name="fbbulk-r3h" 
relid="h3" ROOT="${GMOD_ROOT}/" TMP="${GMOD_ROOT}/tmp" datadir="genomes/Drosophila_melanogaster" > <title>DHGP/FlyBase Heterochromatin rel 3.2</title> <doc name="README"> D. melanogaster heterochromatin genome data from Drosophila Heterochromatin Genome Project, www.dhgp.org Release 3.2 See http://flybase.net/annot/dmel_het_release3.2.txt </doc> <!-- this could/should be from include flybase-release.xml want default release/db xml that this overrides where needed? --> <release id="3" rel="r3.2.1" relfull="dmel_r3.2.1_07212004" dbname="chado_r3_2_27" date="20040804" release_url="/annot/release3.2.1.html" /> <release id="h3" rel="r3_2h" dbname="chadohet_081604" relfull="dmel_hetr3_2_08162004" date="20040821" release_url="/annot/het-release3.2.html" /> <db driver="Pg" name="dmelhet_chado" host="bugbane.bio.indiana.edu" port="7302" user="flybase" password="" /> <org>dmel</org> <species>Drosophila melanogaster</species> <dnadump path="dna/[\w\-\_]+.raw" sql="select feature_id, residues from feature where uniquename = ?" type="dna_dump" /> <featdump path="tmp/featdump/\w+.tsv" config="chadofeatsql" tag="feature_sql" type="feature_table" splitname="chadofeat" > <!-- <target>fbids</target> // accessory table ; fixme --> <target>chromosomes</target> <target>features</target> <target>analysis</target> <!-- <target>matches</target> --> <!-- drop matches for analysis which has same/better info hetr32 matches has only generic 'alignment' and 'sim4:wrap...' --> </featdump> <fileset name="feature_table" path="tmp/featdump/\w+.tsv" config="chadofeatsql" tag="feature_sql" splitname="chadofeat" > <target>chromosomes</target> <target>features</target> <target>analysis</target> </fileset> <!-- fileset are output formats w/ config, perl handler, location, etc. --> <fileset name="dna" path="dna/[\w\-\_]+.raw" input="dna_dump" sql="select feature_id, residues from feature where uniquename = ?" 
title="Chromosome dna" /> <fileset name="fff" path="fff/[\w\-\_]+.fff" input="feature_table" title="FFF Genome features" config="dmelhetfeatconv" handler="FeatureWriter" dogzip="1" /> <fileset name="gff" path="gff/[\w\-\_]+.gff" title="GFF Genome features" dogzip="1" /> <fileset name="gnomap" path="gnomap/[\w\-\_]+.tsv" input="fff" config="tognomap" handler="GnomapWriter" title="GnoMap features" indexonly="0" /> <fileset name="gbrowse" path="gnomap/gbrowse.conf" config="gbrowseconf" handler="GnomapWriter" withvars="1" /> <fileset name="fasta" path="fasta/[\w\-\_]+.fasta" input="fff" config="tofasta" handler="FastaWriter" dropnotes="synonym_2nd,synonym" makeall="1" perchr="1" dogzip="1" title="Genome feature sequence fasta" /> <fileset name="blast" path="blast/[\w\-\_]+.*" input="fasta" config="blastfiles" handler="BlastWriter" title="Blast indices" /> <!-- <fileset --> <!-- name="luseq" --> <!-- input="fasta" --> <!-- path="indices/lucene/seqs/.*" --> <!-- title="lucene sequence indices" --> <!-- config="lucegene" --> <!-- handler="LucegeneWriter" --> <!-- /> --> <!-- <fileset --> <!-- name="lufeat" --> <!-- input="fff" --> <!-- path="indices/lucene/gnomap/.*" --> <!-- config="lucegene" --> <!-- handler="LucegeneWriter" --> <!-- title="lucene map indices" --> <!-- /> --> <!-- use to add parent ids to features .. change tag parentidpattern ? --> <idpattern>(FBgn|FBti)\d+</idpattern> <!-- feature sets to make fasta bulk files .. change tag to fastaset ? .. add intergene set ? --> <featset>chromosome</featset> <!-- <featset>scaffold</featset> ? no het scaffolds? --> <featset>gene</featset> <featset>mRNA</featset> <featset>CDS</featset> <featset>transcript</featset> <featset>translation</featset> <featset>tRNA</featset> <featset>miscRNA</featset> <featset>transposon</featset> <featset>pseudogene</featset> <featset>gene_extended2000</featset> <featset>intergenic</featset> <!-- no het gmodel parts ! 
--> <!-- <featset>five_prime_UTR</featset> --> <!-- <featset>three_prime_UTR</featset> --> <!-- <featset>intron</featset> --> <!-- see featmap in chadofeatconf --> </opt> --- NEW FILE: gbrowseconf.xml --- <opt name="gbrowseconf" date="20040826" > <!-- need to call XML::Simple w/ ${variables} below in doc ${species} ${relfull} ${date} ${datapath} ${default_location} ${chromosomes} == ${examples} ?? readConfig( 'gbrowseconfig', { Variables => \%featvars }); --> <title>Gbrowse conf generator</title> <about> </about> <!-- add gbrowse.conf file parts here; as per blastfiles.doc --> <doc name="dummy"></doc> <doc name="gbrowse" path="gnomap/gbrowse.conf"> <!-- no content tag here --> <header title="header"><![CDATA[ # gbrowse config file for genome maps [GENERAL] description = ${species} ${relfull} ${date} datapath = ${gnomapdir} browser title = Genome Browser help = /maps/gbrowse # adaptor = flybase::gmod::FFFdb adaptor = default_name = ${default_location} # X:100000-200000 default_range = 1-500000 # examples to show in the introduction examples = ${examples} header = Genome Browser footer = <hr> Adapted from GBrowse of the <a href="http://www.gmod.org/"> Generic Model Organism Database Project.</a> debug = 0 plugins = BatchDumper FastaDumper #fixme ... default features = rev_ruler cytoband gene tRNA noncodingRNA golden_path segment BAC transposable_element pseudogene primary_feature = gene scale_feature = cytoband dumpviews = FastA GenBank GFF dataviews = "Default" "Collapse All" "Expand All" # Web site configuration info stylesheet = /templates/gbrowse.css buttons = /common/perl/gbrowse_fb/images/buttons tmpimages = /tmp/gbrowse_fb # max and default segment sizes for detailed view max segment = 1000001 default segment = 100000 ## base range to expand around gene given ID lookup expand_by = 20000 zoom levels = 100 200 1000 2000 5000 10000 20000 40000 100000 200000 500000 1000000 searchhelp = Search using Chromosome:base_start..end or FlyBase Gene ID. 
#panel property pad_left = 20 pad_right = 30 key_style = bottom key bgcolor = whitesmoke grid = 1 overview bgcolor = whitesmoke # # where to link to when user clicks in detaild view link = sub { my $f= shift; my $d= $f->primary_id() if $f->can('primary_id'); $d= $f->source() unless($d); if ($d) { if ($d =~ /(FB\w\w\d+)/){ return '/cgi-bin/fbidq.html?'.$1; } elsif ($d =~ /^C[GR]/ ){ return '/cgi-bin/fbannq.html?acc='.$d; } elsif ($d =~ /\-\w+/ ){ return '/cgi-bin/fbannq.html?'.$d; } } if ($f->name()) { return '/cgi-bin/fbsymq.html?'.$f->name(); } return '' ; } title = sub { my $f= shift; return $f->class .":". $f->info . " ". $f->seq_id .':'.$f->start."..".$f->end; } # "automatic" classes to try when an unqualified identifier is given automatic classes = Symbol Gene Clone # put reversed features on same track or below ? mix_strand = 1 # Default glyph settings glyph = generic height = 5 bgcolor = palegoldenrod fgcolor = cyan boxcolor = blue label density = 20 bump density = 50 overview label density = 160 image widths = 450 640 800 950 1024 default width = 800 width = 800 #---------- end [GENERAL] ------------------------ ]]></header> <footer title="footer"><![CDATA[ ]]></footer> </doc> <fdef name="GENERIC"><![CDATA[ [GENERIC] feature = GENERIC glyph = segments bgcolor = lightslategray fgcolor = black key = GENERIC citation = GENERIC keygroup = "Analysis features" link = ]]></fdef> <fdef name="rev_ruler"><![CDATA[ [rev_ruler] glyph = ruler_arrow label = 0 no_53_label = 1 label_align = center tick = 1 no_tick_label = 0 units = K link = key = Base ruler keygroup = " Genomic features" ]]></fdef> <!-- ============ Genomic features =========== --> <fdef name="cytoband"><![CDATA[ [cytoband] feature = cytoband glyph = revcomp_arrow both = 1 label_align = center fgcolor = black bgcolor = black linewidth = 2 label = 1 label density = 10000 bump = 0 bump density = 0 key = Cytologic band keygroup = " Genomic features" citation = Cytological bands on the polytene chromosomes 
link = sub { my $f=shift; my $u=$ENV{REQUEST_URI}; my $r= $f->ref.":".$f->to_FTstring; $u =~ s/\?.*$//; $u .= '?name='.$r.";doexpand=1"; return $u; } ]]></fdef> <fdef name="dna"><![CDATA[ [dna] feature = source glyph = dnabases draw_dna = 1 strand = forward label = 0 label density = 0 key = DNA sequence keygroup = " Genomic features" link = ]]></fdef> <fdef name="gene" feature="gene mRNA"><![CDATA[ [gene] feature = gene mRNA glyph = transcript2 bgcolor = lightblue fgcolor = blue secondary = mRNA secondary_bgcolor = mediumblue secondary_fgcolor = mediumblue highlight_color = red higlighted = 1 label = 1 label density = 50 key = Gene Model keygroup = " Genomic features" citation = Gene and mRNA (transcript) features (annotation DB ; Chado) link = sub { my $f= shift; my $d= $f->primary_id() if $f->can('primary_id'); $d= $f->source() unless($d); if ($d) { if ($d =~ /(FB\w\w\d+)/){ return '/cgi-bin/fbidq.html?'.$1; } elsif ($d =~ /^C[GR]/ ){ return '/cgi-bin/fbannq.html?acc='.$d; } } if ($f->name()) { return '/cgi-bin/fbgenq.html?symbol='.$f->name(); } return '' ; } ]]></fdef> <fdef name="CDS"><![CDATA[ [CDS] feature = CDS glyph = segments stranded = 1 translation = 1frame bgcolor = palegoldenrod fgcolor = goldenrod label = 1 key = CDS citation = Protein coding sequence (CDS or translation; annotation DB ; Chado) keygroup = " Genomic features" link = ]]></fdef> <fdef name="pseudogene"><![CDATA[ [pseudogene] feature = pseudogene glyph = segments strand_arrow = 1 fgcolor = orange label = 1 key = Pseudogene keygroup = " Genomic features... [truncated message content] |