From: <nm...@us...> - 2011-06-29 20:24:10
|
Revision: 25074 http://gmod.svn.sourceforge.net/gmod/?rev=25074&view=rev Author: nm249 Date: 2011-06-29 20:24:02 +0000 (Wed, 29 Jun 2011) Log Message: ----------- count var in recursive loop set to global Modified Paths: -------------- schema/trunk/chado/bin/load_ncbi_taxonomy.pl Modified: schema/trunk/chado/bin/load_ncbi_taxonomy.pl =================================================================== --- schema/trunk/chado/bin/load_ncbi_taxonomy.pl 2011-06-29 19:19:12 UTC (rev 25073) +++ schema/trunk/chado/bin/load_ncbi_taxonomy.pl 2011-06-29 20:24:02 UTC (rev 25074) @@ -517,10 +517,10 @@ foreach (@updates) { $dbh->do( $_ ); } sub walktree { - my $id = shift; - my $ctr = shift; - - my $children = $dbh->prepare("SELECT phylonode_id + my $phylonode_id = shift; + our $ctr = shift; + message("walking the tree for id $phylonode_id, index count is $ctr\n",1); + my $children = $dbh->prepare("SELECT phylonode_id, organism_id FROM tmp_phylonode WHERE parent_phylonode_id = ?"); my $setleft = $dbh->prepare("UPDATE tmp_phylonode @@ -529,20 +529,18 @@ my $setright = $dbh->prepare("UPDATE tmp_phylonode SET right_idx = ? WHERE phylonode_id = ?"); - - message("\nwalking the tree for $id...\n" , 1); - $setleft->execute($ctr++, $id); - message("Setting left index= $ctr for parent $id\n\n",1); - $children->execute($id); - - while(my ($id) = $children->fetchrow_array() ) { - message( "Found child_id $id \n",1); - walktree($id, $ctr); + + $setleft->execute($ctr++, $phylonode_id); + message("Setting left index= $ctr for parent $phylonode_id\n\n",1); + $children->execute($phylonode_id); + + while(my ($child_id, $organism_id) = $children->fetchrow_array() ) { + message( "Found child_id $child_id (organism_id = $organism_id) \n",1); + walktree($child_id, $ctr); } - $setright->execute($ctr++, $id); - message( "Setting right index= $ctr for id $id\n\n",1); + $setright->execute($ctr++, $phylonode_id); + message( "Setting right index= $ctr for phylonode id $phylonode_id\n\n",1); } - }; if ($@ || $opt_t) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <nm...@us...> - 2012-03-06 18:54:29
|
Revision: 25256 http://gmod.svn.sourceforge.net/gmod/?rev=25256&view=rev Author: nm249 Date: 2012-03-06 18:54:12 +0000 (Tue, 06 Mar 2012) Log Message: ----------- added lookup for organism by ncbi taxonomy id, since species names may change. This loader also assumes species names are unique, as used in NCBI taxonomy. Modified Paths: -------------- schema/trunk/chado/bin/load_ncbi_taxonomy.pl Modified: schema/trunk/chado/bin/load_ncbi_taxonomy.pl =================================================================== --- schema/trunk/chado/bin/load_ncbi_taxonomy.pl 2012-02-28 21:26:20 UTC (rev 25255) +++ schema/trunk/chado/bin/load_ncbi_taxonomy.pl 2012-03-06 18:54:12 UTC (rev 25256) @@ -53,7 +53,7 @@ http://www.ncbi.nlm.nih.gov/sites/entrez?db=Taxonomy and search by taxid (e.g. txis4070[Subtree] ) -=item -p +=item -n phylotree name [optional] @@ -80,7 +80,7 @@ driver. Override driver name in gmod_config -=item -s +=item -p password. Override password in gmod_config @@ -129,7 +129,7 @@ =head1 VERISON AND DATE -Version 2.0, October 2009. +Version 2.1, March 2012. =head1 TODO @@ -147,19 +147,19 @@ use Bio::GMOD::DB::Config; use Bio::Chado::Schema; - +use Try::Tiny; use Getopt::Std; -our ($opt_H, $opt_D, $opt_v, $opt_t, $opt_i, $opt_p, $opt_g, $opt_u, $opt_s, $opt_d); +our ($opt_H, $opt_D, $opt_v, $opt_t, $opt_i, $opt_p, $opt_g, $opt_u, $opt_n, $opt_d); -getopts('H:D:i:p:g:u:s:d:tv'); +getopts('H:D:i:p:g:u:n:d:tv'); my $dbhost = $opt_H; my $dbname = $opt_D; my $infile = $opt_i; -my $phylotree_name= $opt_p || 'NCBI taxonomy tree'; +my $phylotree_name= $opt_n || 'NCBI taxonomy tree'; my $user = $opt_u; -my $pass = $opt_s; +my $pass = $opt_p; my $driver = $opt_d; my $port; @@ -196,7 +196,7 @@ $dsn .= ";host=$dbhost"; $dsn .= ";port=$port" if $port; -$schema= Bio::Chado::Schema->connect( $dsn, $user, $pass, { AutoCommit=>0 }); +$schema= Bio::Chado::Schema->connect( $dsn, $user, $pass||''); $dbh=$schema->storage->dbh(); @@ -377,7 +377,7 @@ my %phylonode=(); my $node_count=0; -eval { +my $coderef = sub { my $root_id; my $organism_id = $maxval{'organism'}; NODE: foreach my $id ( keys %node ) { @@ -417,10 +417,13 @@ if (!$genus || !$species) { die "NO GENUS OR SPECIES FOUND FOR tax_id $genbank_taxon_accession! Check your input file! \n" ; } $organism = $schema->resultset('Organism::Organism')->search( { - genus => { 'ilike'=> $genus }, species => {'ilike' => $species } - })->single(); - if (!$organism) { #create a new empty row object + })->single; # lookup is by species only . NCBI species should be unique! + if (!$organism) { #maybe the organism is already loaded with the ncbi taxonomy id, but the species name has changed? + my $organism_dbxref = $dbxref->organism_dbxrefs->single; + $organism= $organism_dbxref->organism if $organism_dbxref; + } + if (!$organism) { #create a new empty row object $organism = $schema->resultset('Organism::Organism')->new({}); $insert=1; } else { $update = 1; } @@ -433,12 +436,12 @@ if ($update) { $organism->update(); message( "*Updating organism " . $organism->get_column('organism_id') . " (species=" . $organism->species . ")\n", 1); - } + } if ($insert) { $organism->insert(); - message("New organism " . $organism->get_column('organism_id') . " (species=" . $organism->species . ")\n", 1); + message("New organism " . $organism->get_column('organism_id') . " (species=" . $organism->species . ")\n", 1); } - my $organism_id= $organism->get_column('organism_id'); + my $organism_id= $organism->get_column('organism_id'); ########################################### #store the organism synonyms @@ -541,13 +544,16 @@ $setright->execute($ctr++, $phylonode_id); message( "Setting right index= $ctr for phylonode id $phylonode_id\n\n",1); } + if ($opt_t) { die "TEST RUN! rolling back!\n"; } }; +try { + $schema->txn_do($coderef); + message( "Commiting!! \n"); + message("Inserted $node_count phylonodes. \n",1 ); -if ($@ || $opt_t) { - $dbh->rollback(); +} catch { + message( "An error occured! Rolling back! \n $_ \n Resetting database sequences...\n", 1); - message( "Rolling back! \n $@\n Resetting database sequences...\n", 1); - #reset sequences foreach my $key ( keys %seq ) { my $value= $seq{$key}; @@ -555,14 +561,8 @@ if ($maxvalue) { $dbh->do("SELECT setval ('$value', $maxvalue, true)") ; } else { $dbh->do("SELECT setval ('$value', 1, false)"); } } -}else { - message( "Commiting!! \n"); - message("Inserted $node_count phylonodes. \n",1 ); - - $dbh->commit(); -} +}; - sub set_maxval { my $key=shift; my $id_column= $key . "_id"; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <nm...@us...> - 2013-05-29 16:35:37
|
Revision: 25287 http://gmod.svn.sourceforge.net/gmod/?rev=25287&view=rev Author: nm249 Date: 2013-05-29 16:35:23 +0000 (Wed, 29 May 2013) Log Message: ----------- optional loading of entire ncbi taxonomy should work now Modified Paths: -------------- schema/trunk/chado/bin/load_ncbi_taxonomy.pl Modified: schema/trunk/chado/bin/load_ncbi_taxonomy.pl =================================================================== --- schema/trunk/chado/bin/load_ncbi_taxonomy.pl 2013-03-04 22:27:55 UTC (rev 25286) +++ schema/trunk/chado/bin/load_ncbi_taxonomy.pl 2013-05-29 16:35:23 UTC (rev 25287) @@ -290,68 +290,76 @@ #read in the taxonomy tree open( NODE, "nodes.dmp" ); while ( my $line = <NODE> ) { + chomp $line; my ( $id, $parent, $level ) = split /\s+\|\s+/, $line; ###message("id = $id, parent = $parent, level = $level\n",1); - next unless $okay_level{ $level }; - - # check for data consistency - if ($infile) { + if ( !($okay_level{ $level } ) ) { + message("Node rank not found in the database (" . $okay_level{ $level } . "). Skipping\n"); + next; + } + # check for data consistency + if ($infile) { if ( exists $tax_file{$id} ) { # check if the parent is in the taxfile if ( $parent && !(exists $tax_file{ $parent } ) ) { - message ("Parent $parent for tax_id $id does not exist in your input file ! This means $id is your root, or you need to check your input!\n",1); + message ("Parent $parent for tax_id '" . $id . "' does not exist in your input file ! This means $id is your root, or you need to check your input!\n",1); } } else{ next(); } # skip nodes not in tax_file } - + ###message("STORING NODE is node hash\n",1); - $node{ $id }{ 'parent_taxid' } = $parent; - $node{ $id }{ 'self_taxid' } = $id; - $node{ $id }{ 'level' } = $level; + $node{$id}{ 'parent_taxid' } = $parent; + $node{$id}{ 'self_taxid' } = $id; + $node{$id}{ 'level' } = $level; } close( NODE ); open( NAME, "names.dmp" ); while ( my $line = <NAME> ) { - #next unless $line =~ /scientific name/; + chomp $line; my ( $id, $name ) = split /\s+\|\s+/, $line; ###message("NAMES: id = $id, name = $name\n",1); - next unless $node{ $id }; #skip nodes + next unless $node{$id}; #skip nodes if ( $line =~ /scientific name/) { ###message("Storing scientific name '$name'\n",1); - $node{ $id }{ 'name' } = $name; - $node{ $id }{ 'name' } .= " Taxonomy:$id" if $seen{ $name }++; + $node{$id}{ 'name' } = $name; + $node{$id}{ 'name' } .= " Taxonomy:$id" if $seen{ $name }++; } elsif ( $line =~ /common name/) { # genbank common names - push(@{ $node{ $id }{ 'common_name' } } , $name); + push(@{ $node{$id}{ 'common_name' } } , $name); push(@{ $node{$id}{ 'synonyms' } }, $name); - + } elsif ( $line =~ /synonym/ ) { push @{ $node{$id}{ 'synonyms' } }, $name; } + # populate $tax_file with the ids from the names file if a filtering file is not passed + if (!$infile) { + $tax_file{$id} = $id; + } + # } close( NAME ); foreach my $id ( keys %node ) { - ###message("Looking at id $id in node hash... level = " . $node{ $id }{'level'} . "\n",1); - my $parent_taxid = $node{ $id }{ 'parent_taxid' } ; - if (!$tax_file{$parent_taxid}) { - message("No parent id found for species " . $node{ $id }{ 'name' } . " (id = $id) !! This means your species is the root node, or there is an error in yout input file \n", 1); + ##message("Looking at id '" . $id . "' in node hash... level = " . $node{$id}{'level'} . "\n",1); + my $parent_taxid = $node{$id}{ 'parent_taxid' } ; + if (!$tax_file{$parent_taxid}) { + message("No parent id found for species " . $node{ $id }{ 'name' } . " (id = '" . $id . "') !! This means your species is the root node, or there is an error in yout input file \n\n", 1); } if ( $node{ $id }{ 'level' } eq 'species' ) { # load the genus name from the parent_taxid if (!$tax_file{$parent_taxid}) { - die "No parent id found for species " . $node{ $id }{ 'name' } . " (id = $id) !! Check your input file !!\n" ; + die "No parent id found for species " . $node{ $id }{ 'name' } . " (id = '" . $id . "') !! Check your input file !!\n" ; } - + $node{ $id }{ 'genus' } = $node{ $parent_taxid }{ 'name' }; $node{ $id }{ 'species' } = $node{ $id }{ 'name' }; ###message("FOUND SPECIES: " . $node{ $id }{ 'name' } . " genus = " . $node{ $id }{ 'genus' } . "\n" , 1); - + } else { - ###message("FOUND NODE NAME: " . $node{ $node{ $id }{ 'parent_taxid' }}{ 'name' } . "( genus = " . $node{ $id }{ 'level' } . " species = " . $node{ $id }{ 'name' } . "\n",1); - + ###message("FOUND NODE NAME: " . $node{ $node{ $id }{ 'parent_taxid' }}{ 'name' } . "( genus = " . $node{ $id }{ 'level' } . " species = " . $node{ $id }{ 'name' } . "\n",1); + $node{ $id }{ 'genus' } = $node{ $id }{ 'level' }; $node{ $id }{ 'species' } = $node{ $id }{ 'name' }; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |