[Gmod-schema-cmts] schema/GMODTools/conf/bulkfiles drosmelgb.xml, 1.1, 1.2 filesets.xml, 1.10, 1.11 genbanksubmit.xml, 1.2, 1.3

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 454-5900

Update of /cvsroot/gmod/schema/GMODTools/conf/bulkfiles
In directory sc8-pr-cvs2.sourceforge.net:/tmp/cvs-serv21431/conf/bulkfiles

Modified Files:
	drosmelgb.xml filesets.xml genbanksubmit.xml 
Log Message:
genbank submit update: works with NCBI tbl2asn now

Index: filesets.xml
===================================================================
RCS file: /cvsroot/gmod/schema/GMODTools/conf/bulkfiles/filesets.xml,v
retrieving revision 1.10
retrieving revision 1.11
diff -C2 -d -r1.10 -r1.11
*** filesets.xml	12 May 2008 04:50:44 -0000	1.10
--- filesets.xml	14 May 2008 05:58:22 -0000	1.11
***************
*** 210,213 ****
--- 210,214 ----
      id="genbanktbl"
      path="genbanksubmit/"
+     suffix="tbl"
      handler="GenbankSubmitWriter"
      title="Genbank genome submit table"

Index: drosmelgb.xml
===================================================================
RCS file: /cvsroot/gmod/schema/GMODTools/conf/bulkfiles/drosmelgb.xml,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** drosmelgb.xml	13 May 2008 04:40:55 -0000	1.1
--- drosmelgb.xml	14 May 2008 05:58:22 -0000	1.2
***************
*** 30,34 ****
    <release_url>/genome/${species}/release-${release_id}.html</release_url>
  
!   <org>drosmelgb</org>
    <species>Drosophila_melanogaster</species>
  
--- 30,34 ----
    <release_url>/genome/${species}/release-${release_id}.html</release_url>
  
!   <org>dromel</org>
    <species>Drosophila_melanogaster</species>
  

Index: genbanksubmit.xml
===================================================================
RCS file: /cvsroot/gmod/schema/GMODTools/conf/bulkfiles/genbanksubmit.xml,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** genbanksubmit.xml	13 May 2008 04:40:55 -0000	1.2
--- genbanksubmit.xml	14 May 2008 05:58:22 -0000	1.3
***************
*** 2,21 ****
    name="genbanksubmit"
    date="20080511"
!   
    >
    
    <title>GenBank Submission table writer</title>
!   <about>
!     These are configurations for converting chado feature table dumps to
!     Genbank submission table files. Most of these configs specify how to
      process and convert various features extracted from chado db.
!     These are tied to methods in Bulkfiles::FeatureWriter.pm)
      See chadofeatconv.xml for similar methods.
  
!   Valid options include
! 
!     recodekey id={one of feature keys} value=output-key  
!     
!   </about>
    
    <!-- 
--- 2,26 ----
    name="genbanksubmit"
    date="20080511"
! 
!   tbl2asn="${ARGOS_ROOT}/common/servers/blast/Bin/tbl2asn" 
!   tbl2asnopts="-t template.sbt -V vb -p ./  " 
    >
    
    <title>GenBank Submission table writer</title>
!   
!   <about><![CDATA[
!     These are configurations for converting Chado genome features to
!     Genbank submission table files.  Most of these specify how to
      process and convert various features extracted from chado db.
!     These are tied to methods in Bulkfiles::GenbankSubmitWriter/FeatureWriter
      See chadofeatconv.xml for similar methods.
  
!     Refer to these specifications:
!     http://www.ncbi.nlm.nih.gov/Genbank/eukaryotic_genome_submission.html
!     http://www.ncbi.nlm.nih.gov/Genbank/eukaryotic_genome_submission_annotation.html
!     http://www.ncbi.nlm.nih.gov/Genbank/tbl2asn2.html
!     http://www.ncbi.nlm.nih.gov/Sequin/table.html
!     http://www.ncbi.nlm.nih.gov/projects/collab/FT/index.html
!   ]]></about>
    
    <!-- 
***************
*** 23,31 ****
      set of rewrite patterns and logic lives that gbsub also
      needs; should separate out good from old there into two files.
-     
      need some of these: mapattr_pattern**; maptype_pattern; mapname_pattern; 
        ** feat_model
    -->
- 
    <include>chadofeatconv</include>
    
--- 28,34 ----
***************
*** 35,54 ****
    <outformats>dummy</outformats>
  
-   <!-- genbank tags from internal tags 
-    all ncRNA SO types become this
-    type ncRNA ; attr 'ncRNA_class	snRNA'
-   -->
    <mapattr_key id="Dbxref">db_xref</mapattr_key>
    <mapattr_key id="Note">note</mapattr_key>
-   <mapattr_key id="synonym">gene_syn</mapattr_key>
    <mapattr_key id="cyto_range">cyt_map</mapattr_key>
    <mapattr_key id="map">cyt_map</mapattr_key>
    <mapattr_key id="eC_number">EC_number</mapattr_key> <!-- fix a GFFism -->
!     
!     
!   <!-- Name is special case depending on feature type -->
!   <mapattr_key id="Name">gene</mapattr_key>
!   <!-- mapattr_key id="ID">locus_tag</mapattr_key -->
!   <!-- ditto for ID : can become locus_tag; transcript_id ; protein_id ... -->
  
    <!-- see maptype_gff; all the type recodes here .. ncRNA ??
--- 38,91 ----
    <outformats>dummy</outformats>
  
    <mapattr_key id="Dbxref">db_xref</mapattr_key>
    <mapattr_key id="Note">note</mapattr_key>
    <mapattr_key id="cyto_range">cyt_map</mapattr_key>
    <mapattr_key id="map">cyt_map</mapattr_key>
+   <mapattr_key id="species">organism</mapattr_key>
    <mapattr_key id="eC_number">EC_number</mapattr_key> <!-- fix a GFFism -->
!   <!-- mapattr_key id="synonym" >gene_syn</mapattr_key -->
!         
!   <!-- ID, Name, others are special cases depending on feature type;
!       see GenbankSubmitWriter::handleAttribOut()
!       try this mapattr_feature
!   -->
!   <mapattr_key id="ID">db_xref</mapattr_key>
!   <mapattr_key id="Name">name</mapattr_key>
! 
!   <mapattr_key_gene
!     ID="locus_tag"
!     Name="gene"
!     synonym="gene_syn"
!     locus_tag="old_locus_tag"
!     gene="old_gene"
!     />
!   <mapattr_key_mRNA match="RNA" 
!     ID="transcript_id"
!     Parent="locus_tag"
!     ParentName="gene"
!     locus_tag="old_locus_tag"
!     gene="old_gene"
!     Name="product"
!     transcript_id="old_transcript_id"
!     product="old_product"
!     />
!  <mapattr_key_CDS 
!     ID="protein_id"
!     Parent="skip"
!     ParentName="skip"
!     GrandParent="locus_tag"
!     GrandParentName="gene"
!     locus_tag="old_locus_tag"
!     gene="old_gene"
!     Name="product"
!     protein_id="old_protein_id"
!     product="old_product"
!     />
!   <mapattr_key_transposon
!     ID="transposon"
!     transposon="old_transposon"
!     />
! 
! 
  
    <!-- see maptype_gff; all the type recodes here .. ncRNA ??
***************
*** 128,132 ****
    </feat_model>
    
-   <!-- default structure for non-simple 'mRNA' types -->
    <!-- tRNA , rRNA are separate in Genbank ; equal to mRNA
    (snoRNA, scRNA, snRNA, miRNA, ncRNA, rRNA)
--- 165,168 ----
***************
*** 166,168 ****
--- 202,332 ----
  
  
+ 
+   <about id="submit_template_info"><![CDATA[
+ ref: ncbi/doc/tbl2asn.txt
+     
+ SUBMISSION TEMPLATE FORMAT
+ 
+ The submission template is an ASN.1 Submit-block that can be generated by
+ Sequin.  A simple example is shown below.
+ 
+    ]]></about>
+   
+   <doc id="submit_template" path="genbanksubmit/template.sbt">
+   <!-- FIXME: configure this or expect customers to edit to taste ? -->
+ <![CDATA[
+ 
+ Submit-block ::= {
+   contact {
+     contact {
+       name
+         name {
+           last "Darwin" ,
+           first "Charles" ,
+           initials "C.R." ,
+           suffix "" } ,
+       affil
+         std {
+           affil "Oxbridge University" ,
+           div "Evolutionary Biology Department" ,
+           city "Camford" ,
+           country "United Kingdom" ,
+           street "1859 Tennis Court Lane" ,
+           email "da...@be..." ,
+           phone "01 44 171-007-1212" ,
+           postal-code "OX1 2BH" } } } ,
+   cit {
+     authors {
+       names
+         std {
+           {
+             name
+               name {
+                 last "Darwin" ,
+                 first "Charles" ,
+                 initials "C.R." } } } ,
+       affil
+         std {
+           affil "Oxbridge University" ,
+           div "Evolutionary Biology Department" ,
+           city "Camford" ,
+           country "United Kingdom" ,
+           street "1859 Tennis Court Lane" ,
+           postal-code "OX1 2BH" } } ,
+     date
+       std {
+         year 2003 ,
+         month 2 ,
+         day 28 } } ,
+   subtype new  }
+   ]]> 
+   </doc>
+ 
+  <about id="working notes">
+  <![CDATA[
+  working notes:
+     * add NCBI tbl2asn executable options as per blastfiles.xml
+   tbl2asn="${ARGOS_ROOT}/common/servers/blast/Bin/tbl2asn" 
+   tbl2asnopts=" " 
+    e.g.  tbl2asn -t template.sbt -p path_to_files -a s -V v
+ 
+ # this works .. with errors
+ $nb/tbl2asn -t template.sbt -V vb -p ./  
+ 
+ GMODTools/data/genomes/Drosophila_melanogaster/drosmelgb_20080512/genbanktbl
+ 
+ # outputs
+ 2415212 May 13 19:50 drosmelgb-all-drosmelgb4.gbf
+ 3910617 May 13 19:50 drosmelgb-all-drosmelgb4.sqn
+  653387 May 13 19:50 drosmelgb-all-drosmelgb4.val == ERRORS list
+ 
+ # inputs
+  265366 May 13 19:48 drosmelgb-all-drosmelgb4.tbl
+ 1379004 May 13 19:31 drosmelgb-all-drosmelgb4.fsa == ../fasta/drosmelgb-all-chromosome-drosmelgb4.fasta
+  296419 May 13 19:30 drosmelgb-all-drosmelgb4.pep == ../fasta/drosmelgb-all-translation-drosmelgb4.fasta
+    1085 May 13 19:22 template.sbt
+ 
+ # prot translations mostly bad: no start_codon= field *** only problem w/ genbank2chado ?
+ # ** need to always explicitly write this field from CDS_span location?
+   shouldn't need this; CDS locs should have start/stop
+ 
+  	perl -pi -e's/protein_id\tNP/old_protein_id\tNP/;' drosmelgb-all-drosmelgb4.tbl
+  	perl -pi -e's/transcript_id\tNM_/old_transcript_id\tNM_/;' drosmelgb-all-drosmelgb4.tbl
+ 
+ # all prots.pep missed : fix >IDs ??
+ [NULL_Caption] Unable to find protein sequence CG17923.p01
+ SeqID must match protein_id in the .tbl file
+ 
+ 
+ # ERRORS .val................
+ 
+ ERROR: valid [SEQ_DESCR.NoOrgFound] No organism name anywhere on this entire record. BIOSEQ: lcl|NC_004353: ra
+ w, dna len= 1351857
+  >> change chromosome type to source type
+ 
+  	perl -pi -e's/1351857\tchromosome/1351857\tsource/;s/Drosophila_melanogaster/Drosophila melanogaster/;' drosmelgb-all-drosmelgb4.tbl
+ 
+ WARNING: valid [SEQ_FEAT.IllegalDbXref] db_xref type GI is only created by the flatfile generator, 
+   >> drop all db_xref == GI: 
+ 
+  	perl -pi -e's/db_xref\tGI:/db_xref\told_GI:/;' drosmelgb-all-drosmelgb4.tbl
+   
+ ERROR: valid [SEQ_FEAT.SeqLocOrder] Location: Intervals out of order in SeqLoc [(lcl|NC_004353:c53751-53434, c
+ 53999-53817, c57083-56500, c61911-57142, c64403-63540)] FEATURE: CDS: plexin B CG17245-PA [(lcl|NC_004353:c537
+ 51-53434, c53999-53817, c57083-56500, c61911-57142, c64403-63540)] [lcl|NC_004353: raw, dna len= 1351857] -> [
+ lcl|plexB.p01]
+   >> this is probably revcomp error; put last first ...
+ 
+ WARNING: valid [SEQ_FEAT.UnnecessaryGeneXref] Unnecessary gene cross-reference CG32005 FEATURE: CDS: CG32005-P
+ A [lcl|NC_004353:111840-113903] [lcl|NC_004353: raw, dna len= 1351857] -> [lcl|CG32005.p01]
+  >>  
+  111840  113903  CDS
+         protein_id      CG32005.p01
+         gene    CG32005     <<< is this the problem?
+         product CG32005-PA
+  
+  
+ ]]> </about>
+ 
+ 
  </opt>

[Gmod-schema-cmts] schema/GMODTools/conf/bulkfiles drosmelgb.xml, 1.1, 1.2 filesets.xml, 1.10, 1.11

[Gmod-schema-cmts] schema/GMODTools/conf/bulkfiles drosmelgb.xml, 1.1, 1.2 filesets.xml, 1.10, 1.11 genbanksubmit.xml, 1.2, 1.3