[Gmod-schema-cmts] SF.net SVN: gmod:[22422] schema/trunk/chado

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 454-5900

Revision: 22422
          http://gmod.svn.sourceforge.net/gmod/?rev=22422&view=rev
Author:   scottcain
Date:     2009-12-16 17:15:06 +0000 (Wed, 16 Dec 2009)

Log Message:
-----------
adding a --fastafile option to gmod_bulk_load_gff3.pl

Modified Paths:
--------------
    schema/trunk/chado/Changes
    schema/trunk/chado/bin/gmod_gff3_preprocessor.pl
    schema/trunk/chado/lib/Bio/GMOD/DB/Adapter.pm
    schema/trunk/chado/load/bin/bulk_load_gff3.PLS
    schema/trunk/chado/modules/companalysis/companalysis.sql

Modified: schema/trunk/chado/Changes
===================================================================

--- schema/trunk/chado/Changes	2009-12-16 15:38:43 UTC (rev 22421)
+++ schema/trunk/chado/Changes	2009-12-16 17:15:06 UTC (rev 22422)
@@ -47,9 +47,10 @@
 * Adding the ability to specify that gmod_bulk_load_gff3.pl use the
 system tmp directory rather than using the current working directory
 for writing the temporary load files.
+* Added a --fastafile option to gmod_bulk_load_gff3.pl to use when 
+loading fasta files.
 
 
-
 Below this line are changes up until a few years ago and are included
 for historical purposes.
 ---------------------------------------------------------------------------

Modified: schema/trunk/chado/bin/gmod_gff3_preprocessor.pl
===================================================================
--- schema/trunk/chado/bin/gmod_gff3_preprocessor.pl	2009-12-16 15:38:43 UTC (rev 22421)
+++ schema/trunk/chado/bin/gmod_gff3_preprocessor.pl	2009-12-16 17:15:06 UTC (rev 22422)
@@ -242,9 +242,9 @@
             open FASTA, 
                  ">", $fasta or die "couldn't open $fasta for writing: $!\n";
 
-            print FASTA "##gff-version 3\n";
-            print FASTA;
-            print FASTA "\n";  #extra cr works around bug in Bio::FeatureIO::gff
+            #print FASTA "##gff-version 3\n";
+            #print FASTA;
+            #print FASTA "\n";  #extra cr works around bug in Bio::FeatureIO::gff
             next;
         }
         elsif ($fasta_flag) {

Modified: schema/trunk/chado/lib/Bio/GMOD/DB/Adapter.pm
===================================================================
--- schema/trunk/chado/lib/Bio/GMOD/DB/Adapter.pm	2009-12-16 15:38:43 UTC (rev 22421)
+++ schema/trunk/chado/lib/Bio/GMOD/DB/Adapter.pm	2009-12-16 17:15:06 UTC (rev 22422)
@@ -1575,16 +1575,12 @@
       my $sth          = $self->dbh->prepare($max_id_query);
       $sth->execute;
       my ($max_id)     = $sth->fetchrow_array();
-
-       warn "$table $max_id";
       
       my $curval_query = "SELECT nextval('$sequences{$table}')";
       $sth             = $self->dbh->prepare($curval_query);
       $sth->execute;
       my ($curval)     = $sth->fetchrow_array();      
 
-       warn "$table $curval";
-
       if ($max_id > $curval) {
           my $setval_query = "SELECT setval('$sequences{$table}',$max_id)";
           $sth             = $self->dbh->prepare($setval_query);
@@ -2544,6 +2540,41 @@
   return;
 }
 
+sub print_fasta {
+  my $self = shift;
+  my ($uniquename,$string) = @_;
+  my $dbh = $self->dbh;
+  my $organism_id = $self->organism_id;
+
+  #assume that the fasta ID matches the uniquename (ie, no munging when it went into the database)
+  my $fid_query = "SELECT feature_id FROM feature WHERE uniquename = ? AND organism_id = ?";
+  my $sth = $dbh->prepare($fid_query);
+  $sth->execute($uniquename,$organism_id);
+
+  if ($sth->rows == 0) {
+    warn <<END;
+No features where found with a unqiuename of $uniquename
+and an organism_id of $organism_id.  Are you sure you have the uniquename
+right?  It might have been changed when loaded into the database to ensure
+uniqueness.  Skipping this sequence...
+
+END
+    return;
+  } 
+  elsif ($sth->rows > 1) {
+    warn "More than one feature found for $uniquename, org_id:$organism_id when trying to add sequence, skipping...\n\n";
+    return;
+  }
+
+  my ($feature_id) = $sth->fetchrow_array;
+
+  my $fh = $self->file_handles('sequence');
+  print $fh "UPDATE feature set residues='$string' WHERE feature_id=$feature_id;\n";
+  print $fh "UPDATE feature set seqlen=length(residues) WHERE feature_id=$feature_id;\n";
+
+  return;
+}
+
 sub print_af {
   my $self = shift;
   my ($af_id,$f_id,$a_id,$score) = @_;

Modified: schema/trunk/chado/load/bin/bulk_load_gff3.PLS
===================================================================
--- schema/trunk/chado/load/bin/bulk_load_gff3.PLS	2009-12-16 15:38:43 UTC (rev 22421)
+++ schema/trunk/chado/load/bin/bulk_load_gff3.PLS	2009-12-16 17:15:06 UTC (rev 22422)
@@ -36,6 +36,7 @@
 use warnings;
 use lib '/Users/cain/cvs_stuff/schema/trunk/chado/lib';
 use Bio::FeatureIO;
+use Bio::SeqIO;
 use Getopt::Long;
 use Data::Dumper;
 use Pod::Usage;
@@ -58,6 +59,7 @@
 
  --gfffile         The file containing GFF3 (optional, can read 
                      from stdin)
+ --fastafile       Fasta file to load sequence from
  --organism        The organism for the data 
                     (use the value 'fromdata' to read from GFF organism=xxx)
  --dbprofile       Database config profile name
@@ -336,6 +338,18 @@
 
 =over
 
+=item Loading fasta file
+
+When the --fastafile is provided with an argument that is the path to
+a file containing fasta sequence, the loader will attempt to update the
+feature table with the sequence provided.  Note that the ID provided in the
+fasta description line must exactly match what is in the feature table
+uniquename field.  Be careful if it is possible that the uniquename of the
+feature was changed to ensure uniqueness when it was loaded from the
+original GFF.  Also note that when loading sequence from a fasta file, 
+loading GFF from standard in is disabled.  Sorry for any inconvenience.
+
+
 =item ##sequence-region
 
 This script does not use sequence-region directives for anything.
@@ -546,7 +560,7 @@
 
 =cut
 
-my ($ORGANISM, $GFFFILE,$DBPROFILE, $DBNAME, $DBUSER, $DBPASS,$DBHOST, $DBPORT, 
+my ($ORGANISM, $GFFFILE,$FASTAFILE,$DBPROFILE, $DBNAME, $DBUSER, $DBPASS,$DBHOST, $DBPORT, 
     $ANALYSIS, $ANALYSIS_GROUP, $GLOBAL_ANALYSIS, $NOLOAD, $VALIDATE, $INSERTS,
     $NOTRANSACT, $NOSEQUENCE, $SCORE_COL, $ONTOLOGY, $SKIP_VACUUM,
     $DROP_INDEX, $NOEXON, $NOUNIQUECACHE, $RECREATE_CACHE, $SAVE_TMPFILES,$RANDOM_TMP_DIR,
@@ -557,6 +571,7 @@
 GetOptions(
     'organism=s' => \$ORGANISM,
     'gfffile=s'  => \$GFFFILE,
+    'fastafile=s'=> \$FASTAFILE,
     'dbprofile=s'=> \$DBPROFILE,
     'dbname=s'   => \$DBNAME,
     'dbuser=s'   => \$DBUSER,
@@ -660,6 +675,7 @@
 my %argv;
   $argv{organism}     = $ORGANISM;
   $argv{gfffile}      = $GFFFILE;
+  $argv{fastafile}    = $FASTAFILE;
   $argv{dbprofile}    = $DBPROFILE;
   $argv{dbname}       = $DBNAME;
   $argv{dbuser}       = $DBUSER;
@@ -730,11 +746,11 @@
 $chado->file_handles();
 
 my $gffio;
-if ($GFFFILE eq 'stdin') {
+if ($GFFFILE eq 'stdin' and !$FASTAFILE) {
     $gffio = Bio::FeatureIO->new(-fh   => \*STDIN , 
                                  -format => 'gff', 
                                  -validate_terms => $VALIDATE);
-} else {
+} elsif ($GFFFILE and $GFFFILE ne 'stdin') {
     $gffio = Bio::FeatureIO->new(-file => $GFFFILE, 
                                  -format => 'gff', 
                                  -validate_terms => $VALIDATE);
@@ -755,7 +771,7 @@
     $chado->organism_id($ORGANISM)
        or die "$ORGANISM organism not found in the database";
 }
-elsif ($gffio->organism) {
+elsif (defined $gffio && $gffio->organism) {
     $ORGANISM = $gffio->organism;
     $chado->organism($ORGANISM);
     $chado->organism_id($ORGANISM)
@@ -786,7 +802,8 @@
 my $feature_iterator; my $itern=0; my %seen_organism;
 FEATURE:
 while(my $feature = 
-  (defined $feature_iterator && $feature_iterator->next_feature) || $gffio->next_feature){
+  (defined $feature_iterator && $feature_iterator->next_feature) 
+    || (defined $gffio && $gffio->next_feature)){
   $chado->primary_dbxref('');
   
   my $featuretype = $feature->type->name;
@@ -1052,7 +1069,7 @@
 #$validate_uniquename->finish;
 
 #deal with sequence 
-unless ($NOSEQUENCE) {
+unless ($NOSEQUENCE or !defined $gffio) { #ugh--reversed unless logic
   while (my $seq = $gffio->next_seq) {
     my $string = $seq->seq();
     my $name   = $seq->display_id();
@@ -1060,6 +1077,15 @@
   }
 }
 
+if ($FASTAFILE) {
+    #use SeqIO to parse the fasta file
+    my $in = Bio::SeqIO->new(-file   => $FASTAFILE,
+                             -format => 'fasta');
+    while (my $seq = $in->next_seq) {
+        $chado->print_fasta($seq->display_id,$seq->seq);
+    }
+}
+
 $chado->flush_caches();
 
 $chado->load_data() unless $NOLOAD;

Modified: schema/trunk/chado/modules/companalysis/companalysis.sql
===================================================================
--- schema/trunk/chado/modules/companalysis/companalysis.sql	2009-12-16 15:38:43 UTC (rev 22421)
+++ schema/trunk/chado/modules/companalysis/companalysis.sql	2009-12-16 17:15:06 UTC (rev 22422)
@@ -92,3 +92,13 @@
     following semantics: * normscores are floating point numbers >= 0,
     * high normscores are better than low one. For most programs, it would be sufficient to make the normscore the same as this rawscore, providing these semantics are satisfied.';
 COMMENT ON COLUMN analysisfeature.rawscore IS 'This is the native score generated by the program; for example, the bitscore generated by blast, sim4 or genscan scores. One should not assume that high is necessarily better than low.';
+
+
+CREATE TABLE analysisfeatureprop (
+    analysisfeatureprop_id SERIAL PRIMARY KEY,
+    analysisfeature_id INTEGER NOT NULL REFERENCES analysisfeature(analysisfeature_id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED,
+    type_id INTEGER NOT NULL REFERENCES cvterm(cvterm_id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED,
+    value TEXT,
+    rank INTEGER NOT NULL,
+    CONSTRAINT analysisfeature_id_type_id_rank UNIQUE(analysisfeature_id, type_id, rank)
+);


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.