From: <sco...@us...> - 2009-12-16 17:15:37
|
Revision: 22422 http://gmod.svn.sourceforge.net/gmod/?rev=22422&view=rev Author: scottcain Date: 2009-12-16 17:15:06 +0000 (Wed, 16 Dec 2009) Log Message: ----------- adding a --fastafile option to gmod_bulk_load_gff3.pl Modified Paths: -------------- schema/trunk/chado/Changes schema/trunk/chado/bin/gmod_gff3_preprocessor.pl schema/trunk/chado/lib/Bio/GMOD/DB/Adapter.pm schema/trunk/chado/load/bin/bulk_load_gff3.PLS schema/trunk/chado/modules/companalysis/companalysis.sql Modified: schema/trunk/chado/Changes =================================================================== --- schema/trunk/chado/Changes 2009-12-16 15:38:43 UTC (rev 22421) +++ schema/trunk/chado/Changes 2009-12-16 17:15:06 UTC (rev 22422) @@ -47,9 +47,10 @@ * Adding the ability to specify that gmod_bulk_load_gff3.pl use the system tmp directory rather than using the current working directory for writing the temporary load files. +* Added a --fastafile option to gmod_bulk_load_gff3.pl to use when +loading fasta files. - Below this line are changes up until a few years ago and are included for historical purposes. --------------------------------------------------------------------------- Modified: schema/trunk/chado/bin/gmod_gff3_preprocessor.pl =================================================================== --- schema/trunk/chado/bin/gmod_gff3_preprocessor.pl 2009-12-16 15:38:43 UTC (rev 22421) +++ schema/trunk/chado/bin/gmod_gff3_preprocessor.pl 2009-12-16 17:15:06 UTC (rev 22422) @@ -242,9 +242,9 @@ open FASTA, ">", $fasta or die "couldn't open $fasta for writing: $!\n"; - print FASTA "##gff-version 3\n"; - print FASTA; - print FASTA "\n"; #extra cr works around bug in Bio::FeatureIO::gff + #print FASTA "##gff-version 3\n"; + #print FASTA; + #print FASTA "\n"; #extra cr works around bug in Bio::FeatureIO::gff next; } elsif ($fasta_flag) { Modified: schema/trunk/chado/lib/Bio/GMOD/DB/Adapter.pm =================================================================== --- schema/trunk/chado/lib/Bio/GMOD/DB/Adapter.pm 2009-12-16 15:38:43 UTC (rev 22421) +++ schema/trunk/chado/lib/Bio/GMOD/DB/Adapter.pm 2009-12-16 17:15:06 UTC (rev 22422) @@ -1575,16 +1575,12 @@ my $sth = $self->dbh->prepare($max_id_query); $sth->execute; my ($max_id) = $sth->fetchrow_array(); - - warn "$table $max_id"; my $curval_query = "SELECT nextval('$sequences{$table}')"; $sth = $self->dbh->prepare($curval_query); $sth->execute; my ($curval) = $sth->fetchrow_array(); - warn "$table $curval"; - if ($max_id > $curval) { my $setval_query = "SELECT setval('$sequences{$table}',$max_id)"; $sth = $self->dbh->prepare($setval_query); @@ -2544,6 +2540,41 @@ return; } +sub print_fasta { + my $self = shift; + my ($uniquename,$string) = @_; + my $dbh = $self->dbh; + my $organism_id = $self->organism_id; + + #assume that the fasta ID matches the uniquename (ie, no munging when it went into the database) + my $fid_query = "SELECT feature_id FROM feature WHERE uniquename = ? AND organism_id = ?"; + my $sth = $dbh->prepare($fid_query); + $sth->execute($uniquename,$organism_id); + + if ($sth->rows == 0) { + warn <<END; +No features where found with a unqiuename of $uniquename +and an organism_id of $organism_id. Are you sure you have the uniquename +right? It might have been changed when loaded into the database to ensure +uniqueness. Skipping this sequence... + +END + return; + } + elsif ($sth->rows > 1) { + warn "More than one feature found for $uniquename, org_id:$organism_id when trying to add sequence, skipping...\n\n"; + return; + } + + my ($feature_id) = $sth->fetchrow_array; + + my $fh = $self->file_handles('sequence'); + print $fh "UPDATE feature set residues='$string' WHERE feature_id=$feature_id;\n"; + print $fh "UPDATE feature set seqlen=length(residues) WHERE feature_id=$feature_id;\n"; + + return; +} + sub print_af { my $self = shift; my ($af_id,$f_id,$a_id,$score) = @_; Modified: schema/trunk/chado/load/bin/bulk_load_gff3.PLS =================================================================== --- schema/trunk/chado/load/bin/bulk_load_gff3.PLS 2009-12-16 15:38:43 UTC (rev 22421) +++ schema/trunk/chado/load/bin/bulk_load_gff3.PLS 2009-12-16 17:15:06 UTC (rev 22422) @@ -36,6 +36,7 @@ use warnings; use lib '/Users/cain/cvs_stuff/schema/trunk/chado/lib'; use Bio::FeatureIO; +use Bio::SeqIO; use Getopt::Long; use Data::Dumper; use Pod::Usage; @@ -58,6 +59,7 @@ --gfffile The file containing GFF3 (optional, can read from stdin) + --fastafile Fasta file to load sequence from --organism The organism for the data (use the value 'fromdata' to read from GFF organism=xxx) --dbprofile Database config profile name @@ -336,6 +338,18 @@ =over +=item Loading fasta file + +When the --fastafile is provided with an argument that is the path to +a file containing fasta sequence, the loader will attempt to update the +feature table with the sequence provided. Note that the ID provided in the +fasta description line must exactly match what is in the feature table +uniquename field. Be careful if it is possible that the uniquename of the +feature was changed to ensure uniqueness when it was loaded from the +original GFF. Also note that when loading sequence from a fasta file, +loading GFF from standard in is disabled. Sorry for any inconvenience. + + =item ##sequence-region This script does not use sequence-region directives for anything. @@ -546,7 +560,7 @@ =cut -my ($ORGANISM, $GFFFILE,$DBPROFILE, $DBNAME, $DBUSER, $DBPASS,$DBHOST, $DBPORT, +my ($ORGANISM, $GFFFILE,$FASTAFILE,$DBPROFILE, $DBNAME, $DBUSER, $DBPASS,$DBHOST, $DBPORT, $ANALYSIS, $ANALYSIS_GROUP, $GLOBAL_ANALYSIS, $NOLOAD, $VALIDATE, $INSERTS, $NOTRANSACT, $NOSEQUENCE, $SCORE_COL, $ONTOLOGY, $SKIP_VACUUM, $DROP_INDEX, $NOEXON, $NOUNIQUECACHE, $RECREATE_CACHE, $SAVE_TMPFILES,$RANDOM_TMP_DIR, @@ -557,6 +571,7 @@ GetOptions( 'organism=s' => \$ORGANISM, 'gfffile=s' => \$GFFFILE, + 'fastafile=s'=> \$FASTAFILE, 'dbprofile=s'=> \$DBPROFILE, 'dbname=s' => \$DBNAME, 'dbuser=s' => \$DBUSER, @@ -660,6 +675,7 @@ my %argv; $argv{organism} = $ORGANISM; $argv{gfffile} = $GFFFILE; + $argv{fastafile} = $FASTAFILE; $argv{dbprofile} = $DBPROFILE; $argv{dbname} = $DBNAME; $argv{dbuser} = $DBUSER; @@ -730,11 +746,11 @@ $chado->file_handles(); my $gffio; -if ($GFFFILE eq 'stdin') { +if ($GFFFILE eq 'stdin' and !$FASTAFILE) { $gffio = Bio::FeatureIO->new(-fh => \*STDIN , -format => 'gff', -validate_terms => $VALIDATE); -} else { +} elsif ($GFFFILE and $GFFFILE ne 'stdin') { $gffio = Bio::FeatureIO->new(-file => $GFFFILE, -format => 'gff', -validate_terms => $VALIDATE); @@ -755,7 +771,7 @@ $chado->organism_id($ORGANISM) or die "$ORGANISM organism not found in the database"; } -elsif ($gffio->organism) { +elsif (defined $gffio && $gffio->organism) { $ORGANISM = $gffio->organism; $chado->organism($ORGANISM); $chado->organism_id($ORGANISM) @@ -786,7 +802,8 @@ my $feature_iterator; my $itern=0; my %seen_organism; FEATURE: while(my $feature = - (defined $feature_iterator && $feature_iterator->next_feature) || $gffio->next_feature){ + (defined $feature_iterator && $feature_iterator->next_feature) + || (defined $gffio && $gffio->next_feature)){ $chado->primary_dbxref(''); my $featuretype = $feature->type->name; @@ -1052,7 +1069,7 @@ #$validate_uniquename->finish; #deal with sequence -unless ($NOSEQUENCE) { +unless ($NOSEQUENCE or !defined $gffio) { #ugh--reversed unless logic while (my $seq = $gffio->next_seq) { my $string = $seq->seq(); my $name = $seq->display_id(); @@ -1060,6 +1077,15 @@ } } +if ($FASTAFILE) { + #use SeqIO to parse the fasta file + my $in = Bio::SeqIO->new(-file => $FASTAFILE, + -format => 'fasta'); + while (my $seq = $in->next_seq) { + $chado->print_fasta($seq->display_id,$seq->seq); + } +} + $chado->flush_caches(); $chado->load_data() unless $NOLOAD; Modified: schema/trunk/chado/modules/companalysis/companalysis.sql =================================================================== --- schema/trunk/chado/modules/companalysis/companalysis.sql 2009-12-16 15:38:43 UTC (rev 22421) +++ schema/trunk/chado/modules/companalysis/companalysis.sql 2009-12-16 17:15:06 UTC (rev 22422) @@ -92,3 +92,13 @@ following semantics: * normscores are floating point numbers >= 0, * high normscores are better than low one. For most programs, it would be sufficient to make the normscore the same as this rawscore, providing these semantics are satisfied.'; COMMENT ON COLUMN analysisfeature.rawscore IS 'This is the native score generated by the program; for example, the bitscore generated by blast, sim4 or genscan scores. One should not assume that high is necessarily better than low.'; + + +CREATE TABLE analysisfeatureprop ( + analysisfeatureprop_id SERIAL PRIMARY KEY, + analysisfeature_id INTEGER NOT NULL REFERENCES analysisfeature(analysisfeature_id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED, + type_id INTEGER NOT NULL REFERENCES cvterm(cvterm_id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED, + value TEXT, + rank INTEGER NOT NULL, + CONSTRAINT analysisfeature_id_type_id_rank UNIQUE(analysisfeature_id, type_id, rank) +); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |