From: Scott C. <sco...@us...> - 2005-05-19 14:43:00
|
Update of /cvsroot/gmod/schema/chado/load/bin In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv2859/load/bin Modified Files: bulk_load_gff3.PLS Log Message: adding docs and Gap handling to bulk loader Index: bulk_load_gff3.PLS =================================================================== RCS file: /cvsroot/gmod/schema/chado/load/bin/bulk_load_gff3.PLS,v retrieving revision 1.34 retrieving revision 1.35 diff -C2 -d -r1.34 -r1.35 *** bulk_load_gff3.PLS 14 May 2005 17:17:49 -0000 1.34 --- bulk_load_gff3.PLS 19 May 2005 14:42:51 -0000 1.35 *************** *** 69,77 **** --ontology Give directions for handling misc Ontology_terms ! Note that all of the arguments that begin 'db' can be provided by default ! by Bio::GMOD::Config, which was installed when 'make install' was run. =head1 DESCRIPTION =head2 NOTES --- 69,238 ---- --ontology Give directions for handling misc Ontology_terms ! Note that all of the arguments that begin 'db' as well as organism can ! be provided by default by Bio::GMOD::Config, which was installed when ! 'make install' was run. =head1 DESCRIPTION + The GFF in the datafile must be version 3 due to its tighter control of + the specification and use of controlled vocabulary. Accordingly, the names + of feature types must be exactly those in the Sequence Ontology Feature + Annotation (SOFA), not the synonyms and not the accession numbers (SO + accession numbers may be supported in future versions of this script). + + =head2 How GFF3 is stored in chado + + Here is a summary of how GFF3 data is stored in chado: + + =over + + =item Column 1 (reference sequence) + + The reference sequence for the feature becomes the srcfeature_id + of the feature in the featureloc table for that feature. That featureloc + is generally assigned a rank of zero; if there are other locations associated + with this feature (for instance, for a match feature), the other locations + will be assigned featureloc.rank values greater than zero. 
+ + =item Column 2 (source) + + The source is stored as a dbxref. The chado instance must have an entry + in the db table named 'GFF_source'. The script will then create a dbxref + entry for the feature's source and associate it to the feature via + the feature_dbxref table. + + =item Column 3 (type) + + The cvterm.cvterm_id of the SOFA type is stored in feature.type_id. + + =item Column 4 (start) + + The value of start minus 1 is stored in featureloc.fmin (one is subtracted + because chado uses interbase coordinates, whereas GFF uses base coordinates). + + =item Column 5 (end) + + The value of end is stored in featureloc.fmax. + + =item Column 6 (score) + + The score is stored in one of the score columns in the analysisfeature + table. The default is analysisfeature.significance. See the + section below on analysis results for more information. + + =item Column 7 (strand) + + The strand is stored in featureloc.strand. + + =item Column 8 (phase) + + The phase is stored in featureloc.phase. Note that there is currently + a problem with the chado schema for the case of single exons having + different phases in different transcripts. If your data has just such + a case, complain to gmo...@li... to find ways + to address this problem. + + =item Column 9 (group) + + Here is where the magic happens. + + =over + + =item Assigning feature.name, feature.uniquename + + The values of feature.name and feature.uniquename are assigned + according to these simple rules: + + =over + + =item If there is an ID tag, that is used as feature.uniquename + + otherwise, it is assigned a uniquename that is equal to + 'auto' concatenated with the feature_id. + + (Note that this is a potential problem as there is no check + to make sure that it is appropriately unique.) + + =item If there is a Name tag, its value is assigned to feature.name; + + otherwise it defaults to the feature type and the uniquename joined by + a hyphen ('type-uniquename'). + + Note that these rules are much simpler than those that + Bio::DB::GFF uses, and may need to be revisited. 
+ + =back + + =item Assigning feature_relationship entries + + All Parent tagged features are assigned feature_relationship + entries of 'part_of' to their parent features. Note that + parent features must appear in the file before any features + use a Parent tag referring to that feature. + + =item Alias tags + + Alias values are stored in the synonym table, under + both synonym.name and synonym.synonym_sgml, and are + linked to the feature via the feature_synonym table. + + =item Dbxref tags + + Dbxref values must be of the form 'db_name:accession', where + db_name must have an entry in the db table, with a value of + db.name equal to 'DB:db_name'; several database names were preinstalled + with the database when 'make prepdb' was run. Execute 'SELECT name + FROM db' to find out what databases are already available. New dbxref + entries are created in the dbxref table, and dbxrefs are linked to + features via the feature_dbxref table. + + =item Gap tags + + The Gap tag is currently mostly ignored--the value is stored as a + featureprop, but otherwise is not used yet. + + =item Note tags + + The values are stored as featureprop entries for the feature. + + =item Any custom (ie, lowercase-first) tags + + Custom tags are supported, provided they already have an entry in the + cvterm table. Their values are stored in the featureprop table with + a type predefined by the database administrator. 
For example, if you + have a custom tag, 'orf_classification', you need an entry in the + dbxref and cvterm tables something like this: + + INSERT INTO dbxref (db_id,accession) + VALUES ( (SELECT db_id FROM db WHERE name='null'), + 'autocreated:orf_classification'); + INSERT INTO cvterm (name,cv_id,dbxref_id) + VALUES ('orf_classification', + (SELECT cv_id FROM cv WHERE name='local'), + (SELECT dbxref_id FROM dbxref WHERE accession='autocreated:orf_classification')); + + =item Ontology_term + + When the Ontology_term tags are used, items from the Gene Ontology + and Sequence Ontology will be processed automatically when the standard + DB:accession format is used (e.g. GO:0001234). To use other ontology + terms, you must specify the mapping of the DB identifiers in the GFF + file to the names of the ontologies in the cv table as comma-separated + tag=value pairs. For example, to use plant and cell ontology terms, + you would supply on the command line: + + --ontology 'PO=plant ontology,CL=cell ontology' + + where 'plant ontology' and 'cell ontology' are the names in the cv table + exactly as they appear. + + =item Target tags + + Oy! this one is complicated, and perhaps buggy. I'll take + a look at it soon. + + =back + + =back + =head2 NOTES *************** *** 119,157 **** substituting in the appropriate values for your organism. - =item GFF3 - - The GFF in the datafile must be version 3 due to its tighter control of - the specification and use of controlled vocabulary. Accordingly, the names - of feature types must be exactly those in the Sequence Ontology, not the - synonyms and not the accession numbers (SO accession numbers may be - supported in future versions of this script). There are several caveates - about the GFF3 that will work with this bulk loader: - - =over - =item Parents/children order Parents must come before children in the GFF file. 
- =item The Gap GFF reserved tag not supported - - Just flat out not supported yet--if you would like to see support, contact - the authors - - =item Any custom (ie, lowercase-first) tag is supported - - Custom tags are supported, provided they already have an entry in the - cvterm table. For example, if you have a custom tab, 'orf_classification', - you need an entry in the dbxref and cvterm tables something like this: - - - INSERT INTO dbxref (db_id,accession) - VALIES ( (SELECT db_id FROM db WHERE name='null'), - 'autocreated:orf_classification'); - INSERT INTO cvterm (name,dbxref_id,cv_id) - VALUES ('orf_classification', - (SELECT cv_id FROM cv WHERE name='local'), - (SELECT dbxref_id FROM dbxref WHERE accession='autocreated:orf_classification')); - =item Analysis --- 280,287 ---- *************** *** 206,226 **** =back - =item Ontology_term - - When the Ontology_term tags are used, items from the Gene Ontology - and Sequence Ontology will be processed automatically when the standard - DB:accession format is used (e.g. GO:0001234). To use other ontology - terms, you must specify that mapping of the DB indentifiers in the GFF - file and the name of the ontologies in the cv table as a comma separated - tag=value pairs. For example, to use plant and cell ontology terms, - you would supply on the command line: - - --ontology 'PO=plant ontology,CL:cell ontology' - - where 'plant ontology' and 'cell ontology' are the names in the cv table - exactly as they appear. - - =back - =head1 AUTHORS --- 336,339 ---- *************** *** 236,256 **** my ($ORGANISM, $GFFFILE, $DBNAME, $DBUSER, $DBPASS, $DBHOST, $DBPORT, $ANALYSIS, $ANALYSIS_GROUP, $GLOBAL_ANALYSIS, $NOLOAD, $VALIDATE, $NOTRANSACT, $NOSEQUENCE, $SCORE_COL, $ONTOLOGY); - if (eval {require Bio::GMOD::Config; - Bio::GMOD::Config->import(); - require Bio::GMOD::DB::Config; - Bio::GMOD::DB::Config->import(); - 1; } ) { - my $gmod_conf = $ENV{'GMOD_ROOT'} || "/var/lib/gmod" ? 
- Bio::GMOD::Config->new($ENV{'GMOD_ROOT'} || "/var/lib/gmod") : - Bio::GMOD::Config->new(); - my $db_conf = Bio::GMOD::DB::Config->new($gmod_conf,'default'); - $DBNAME = $db_conf->name(); - $DBUSER = $db_conf->user(); - $DBPASS = $db_conf->password(); - $DBHOST = $db_conf->host(); - $DBPORT = $db_conf->port(); - $ORGANISM=$db_conf->organism(); - } - GetOptions( 'organism=s' => \$ORGANISM, --- 349,352 ---- *************** *** 267,273 **** 'nosequence' => \$NOSEQUENCE, 'score_col=s'=> \$SCORE_COL, ! 'ontology=s' => \$ONTOLOGY ) or ( system( 'pod2text', $0 ), exit -1 );; $ORGANISM ||='human'; $GFFFILE ||='stdin'; #nobody better name their file 'stdin' --- 363,389 ---- 'nosequence' => \$NOSEQUENCE, 'score_col=s'=> \$SCORE_COL, ! 'ontology=s' => \$ONTOLOGY, ) or ( system( 'pod2text', $0 ), exit -1 );; + + unless ($DBNAME) { + if (eval {require Bio::GMOD::Config; + Bio::GMOD::Config->import(); + require Bio::GMOD::DB::Config; + Bio::GMOD::DB::Config->import(); + 1; } ) { + my $gmod_conf = $ENV{'GMOD_ROOT'} || "/var/lib/gmod" ? + Bio::GMOD::Config->new($ENV{'GMOD_ROOT'} || "/var/lib/gmod") : + Bio::GMOD::Config->new(); + my $db_conf = Bio::GMOD::DB::Config->new($gmod_conf,'default'); + $DBNAME = $db_conf->name(); + $DBUSER = $db_conf->user(); + $DBPASS = $db_conf->password(); + $DBHOST = $db_conf->host(); + $DBPORT = $db_conf->port(); + $ORGANISM=$db_conf->organism(); + } + } + $ORGANISM ||='human'; $GFFFILE ||='stdin'; #nobody better name their file 'stdin' *************** *** 309,313 **** if ($ONTOLOGY) { my @pairs = split /\,/, $ONTOLOGY; ! foreach @pairs { my ($tag, $value) = split/\=/; $cache{ontology}{$tag} = $value; --- 425,429 ---- if ($ONTOLOGY) { my @pairs = split /\,/, $ONTOLOGY; ! 
foreach (@pairs) { my ($tag, $value) = split/\=/; $cache{ontology}{$tag} = $value; *************** *** 615,618 **** --- 731,737 ---- my($uniquename) = ($feature->annotation->get_Annotations('ID'))[0] || "auto$nextfeature"; $uniquename = $uniquename->value if ref($uniquename); + + #check here to see if this uniquename is valid! + my($name) = ($feature->annotation->get_Annotations('Name'))[0] || "$featuretype-$uniquename"; $name = $name->value if ref($name); *************** *** 649,652 **** --- 768,795 ---- } + if ($feature->annotation->get_Annotations('Gap')) { + my @notes = map {$_->value} $feature->annotation->get_Annotations('Gap'); + my $rank = 0; + foreach my $note (@notes) { + unless ($cache{type}{'Gap'}) { + my $sth = + $db->prepare("SELECT cvterm_id FROM cvterm WHERE name='Gap' + AND cv_id in + (SELECT cv_id FROM cv WHERE name='null' OR + name='local')"); + $sth->execute(); + ($cache{type}{'Gap'}) = $sth->fetchrow_array; + } + + if ( !$constraint{featureprop_c1}{ $cache{feature}{$uniquename} }{ $cache{type}{'Gap'}}{ $rank } ) { + $constraint{featureprop_c1}{ $cache{feature}{$uniquename} }{ $cache{type}{'Gap'}}{ $rank }++; + print FPROP join("\t",($nextfeatureprop,$cache{feature}{$uniquename},$cache{type}{'Gap'},uri_unescape($note),$rank)),"\n"; + $rank++; + $nextfeatureprop++; + } + } + } + + if ($feature->annotation->get_Annotations('Note')) { my @notes = map {$_->value} $feature->annotation->get_Annotations('Note'); *************** *** 781,785 **** my ($dbxref) = $search_dbxref->fetchrow_array; ! warn "couldn't find $term in dbxref\n" and next unless $dbxref; $search_cvterm_id_w_dbxref->execute($dbxref); --- 924,928 ---- my ($dbxref) = $search_dbxref->fetchrow_array; ! warn "couldn't find $term in dbxref for db:$cache{ontology}{$d} ($d)\n" and next unless $dbxref; $search_cvterm_id_w_dbxref->execute($dbxref); |