From: <nm...@us...> - 2013-05-29 16:35:37
|
Revision: 25287
http://gmod.svn.sourceforge.net/gmod/?rev=25287&view=rev
Author: nm249
Date: 2013-05-29 16:35:23 +0000 (Wed, 29 May 2013)
Log Message:
-----------
optional loading of entire ncbi taxonomy should work now

Modified Paths:
--------------
schema/trunk/chado/bin/load_ncbi_taxonomy.pl

Modified: schema/trunk/chado/bin/load_ncbi_taxonomy.pl
===================================================================
--- schema/trunk/chado/bin/load_ncbi_taxonomy.pl 2013-03-04 22:27:55 UTC (rev 25286)
+++ schema/trunk/chado/bin/load_ncbi_taxonomy.pl 2013-05-29 16:35:23 UTC (rev 25287)
@@ -290,68 +290,76 @@
 #read in the taxonomy tree
 open( NODE, "nodes.dmp" );
 while ( my $line = <NODE> ) {
+ chomp $line;
 my ( $id, $parent, $level ) = split /\s+\|\s+/, $line;
 ###message("id = $id, parent = $parent, level = $level\n",1);
- next unless $okay_level{ $level };
-
- # check for data consistency
- if ($infile) {
+ if ( !($okay_level{ $level } ) ) {
+ message("Node rank not found in the database (" . $okay_level{ $level } . "). Skipping\n");
+ next;
+ }
+ # check for data consistency
+ if ($infile) {
 if ( exists $tax_file{$id} ) {
 # check if the parent is in the taxfile
 if ( $parent && !(exists $tax_file{ $parent } ) ) {
- message ("Parent $parent for tax_id $id does not exist in your input file ! This means $id is your root, or you need to check your input!\n",1);
+ message ("Parent $parent for tax_id '" . $id . "' does not exist in your input file ! This means $id is your root, or you need to check your input!\n",1);
 }
 }
 else{ next(); } # skip nodes not in tax_file
 }
-
+
 ###message("STORING NODE is node hash\n",1);
- $node{ $id }{ 'parent_taxid' } = $parent;
- $node{ $id }{ 'self_taxid' } = $id;
- $node{ $id }{ 'level' } = $level;
+ $node{$id}{ 'parent_taxid' } = $parent;
+ $node{$id}{ 'self_taxid' } = $id;
+ $node{$id}{ 'level' } = $level;
 }
 close( NODE );
 open( NAME, "names.dmp" );
 while ( my $line = <NAME> ) {
- #next unless $line =~ /scientific name/;
+ chomp $line;
 my ( $id, $name ) = split /\s+\|\s+/, $line;
 ###message("NAMES: id = $id, name = $name\n",1);
- next unless $node{ $id }; #skip nodes
+ next unless $node{$id}; #skip nodes
 if ( $line =~ /scientific name/) {
 ###message("Storing scientific name '$name'\n",1);
- $node{ $id }{ 'name' } = $name;
- $node{ $id }{ 'name' } .= " Taxonomy:$id" if $seen{ $name }++;
+ $node{$id}{ 'name' } = $name;
+ $node{$id}{ 'name' } .= " Taxonomy:$id" if $seen{ $name }++;
 } elsif ( $line =~ /common name/) { # genbank common names
- push(@{ $node{ $id }{ 'common_name' } } , $name);
+ push(@{ $node{$id}{ 'common_name' } } , $name);
 push(@{ $node{$id}{ 'synonyms' } }, $name);
-
+
 } elsif ( $line =~ /synonym/ ) {
 push @{ $node{$id}{ 'synonyms' } }, $name;
 }
+ # populate $tax_file with the ids from the names file if a filtering file is not passed
+ if (!$infile) {
+ $tax_file{$id} = $id;
+ }
+ #
 }
 close( NAME );
 foreach my $id ( keys %node ) {
- ###message("Looking at id $id in node hash... level = " . $node{ $id }{'level'} . "\n",1);
- my $parent_taxid = $node{ $id }{ 'parent_taxid' } ;
- if (!$tax_file{$parent_taxid}) {
- message("No parent id found for species " . $node{ $id }{ 'name' } . " (id = $id) !! This means your species is the root node, or there is an error in yout input file \n", 1);
+ ##message("Looking at id '" . $id . "' in node hash... level = " . $node{$id}{'level'} . "\n",1);
+ my $parent_taxid = $node{$id}{ 'parent_taxid' } ;
+ if (!$tax_file{$parent_taxid}) {
+ message("No parent id found for species " . $node{ $id }{ 'name' } . " (id = '" . $id . "') !! This means your species is the root node, or there is an error in yout input file \n\n", 1);
 }
 if ( $node{ $id }{ 'level' } eq 'species' ) {
 # load the genus name from the parent_taxid
 if (!$tax_file{$parent_taxid}) {
- die "No parent id found for species " . $node{ $id }{ 'name' } . " (id = $id) !! Check your input file !!\n" ;
+ die "No parent id found for species " . $node{ $id }{ 'name' } . " (id = '" . $id . "') !! Check your input file !!\n" ;
 }
-
+
 $node{ $id }{ 'genus' } = $node{ $parent_taxid }{ 'name' };
 $node{ $id }{ 'species' } = $node{ $id }{ 'name' };
 ###message("FOUND SPECIES: " . $node{ $id }{ 'name' } . " genus = " . $node{ $id }{ 'genus' } . "\n" , 1);
-
+
 } else {
- ###message("FOUND NODE NAME: " . $node{ $node{ $id }{ 'parent_taxid' }}{ 'name' } . "( genus = " . $node{ $id }{ 'level' } . " species = " . $node{ $id }{ 'name' } . "\n",1);
-
+ ###message("FOUND NODE NAME: " . $node{ $node{ $id }{ 'parent_taxid' }}{ 'name' } . "( genus = " . $node{ $id }{ 'level' } . " species = " . $node{ $id }{ 'name' } . "\n",1);
+
 $node{ $id }{ 'genus' } = $node{ $id }{ 'level' };
 $node{ $id }{ 'species' } = $node{ $id }{ 'name' };
 }

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|