|
From: <nm...@us...> - 2013-05-29 16:35:37
|
Revision: 25287
http://gmod.svn.sourceforge.net/gmod/?rev=25287&view=rev
Author: nm249
Date: 2013-05-29 16:35:23 +0000 (Wed, 29 May 2013)
Log Message:
-----------
optional loading of entire ncbi taxonomy should work now
Modified Paths:
--------------
schema/trunk/chado/bin/load_ncbi_taxonomy.pl
Modified: schema/trunk/chado/bin/load_ncbi_taxonomy.pl
===================================================================
--- schema/trunk/chado/bin/load_ncbi_taxonomy.pl 2013-03-04 22:27:55 UTC (rev 25286)
+++ schema/trunk/chado/bin/load_ncbi_taxonomy.pl 2013-05-29 16:35:23 UTC (rev 25287)
@@ -290,68 +290,76 @@
#read in the taxonomy tree
open( NODE, "nodes.dmp" );
while ( my $line = <NODE> ) {
+ chomp $line;
my ( $id, $parent, $level ) = split /\s+\|\s+/, $line;
###message("id = $id, parent = $parent, level = $level\n",1);
- next unless $okay_level{ $level };
-
- # check for data consistency
- if ($infile) {
+ if ( !($okay_level{ $level } ) ) {
+ message("Node rank not found in the database (" . $okay_level{ $level } . "). Skipping\n");
+ next;
+ }
+ # check for data consistency
+ if ($infile) {
if ( exists $tax_file{$id} ) {
# check if the parent is in the taxfile
if ( $parent && !(exists $tax_file{ $parent } ) ) {
- message ("Parent $parent for tax_id $id does not exist in your input file ! This means $id is your root, or you need to check your input!\n",1);
+ message ("Parent $parent for tax_id '" . $id . "' does not exist in your input file ! This means $id is your root, or you need to check your input!\n",1);
}
} else{ next(); } # skip nodes not in tax_file
}
-
+
###message("STORING NODE is node hash\n",1);
- $node{ $id }{ 'parent_taxid' } = $parent;
- $node{ $id }{ 'self_taxid' } = $id;
- $node{ $id }{ 'level' } = $level;
+ $node{$id}{ 'parent_taxid' } = $parent;
+ $node{$id}{ 'self_taxid' } = $id;
+ $node{$id}{ 'level' } = $level;
}
close( NODE );
open( NAME, "names.dmp" );
while ( my $line = <NAME> ) {
- #next unless $line =~ /scientific name/;
+ chomp $line;
my ( $id, $name ) = split /\s+\|\s+/, $line;
###message("NAMES: id = $id, name = $name\n",1);
- next unless $node{ $id }; #skip nodes
+ next unless $node{$id}; #skip nodes
if ( $line =~ /scientific name/) {
###message("Storing scientific name '$name'\n",1);
- $node{ $id }{ 'name' } = $name;
- $node{ $id }{ 'name' } .= " Taxonomy:$id" if $seen{ $name }++;
+ $node{$id}{ 'name' } = $name;
+ $node{$id}{ 'name' } .= " Taxonomy:$id" if $seen{ $name }++;
} elsif ( $line =~ /common name/) { # genbank common names
- push(@{ $node{ $id }{ 'common_name' } } , $name);
+ push(@{ $node{$id}{ 'common_name' } } , $name);
push(@{ $node{$id}{ 'synonyms' } }, $name);
-
+
} elsif ( $line =~ /synonym/ ) {
push @{ $node{$id}{ 'synonyms' } }, $name;
}
+ # populate $tax_file with the ids from the names file if a filtering file is not passed
+ if (!$infile) {
+ $tax_file{$id} = $id;
+ }
+ #
}
close( NAME );
foreach my $id ( keys %node ) {
- ###message("Looking at id $id in node hash... level = " . $node{ $id }{'level'} . "\n",1);
- my $parent_taxid = $node{ $id }{ 'parent_taxid' } ;
- if (!$tax_file{$parent_taxid}) {
- message("No parent id found for species " . $node{ $id }{ 'name' } . " (id = $id) !! This means your species is the root node, or there is an error in yout input file \n", 1);
+ ##message("Looking at id '" . $id . "' in node hash... level = " . $node{$id}{'level'} . "\n",1);
+ my $parent_taxid = $node{$id}{ 'parent_taxid' } ;
+ if (!$tax_file{$parent_taxid}) {
+ message("No parent id found for species " . $node{ $id }{ 'name' } . " (id = '" . $id . "') !! This means your species is the root node, or there is an error in yout input file \n\n", 1);
}
if ( $node{ $id }{ 'level' } eq 'species' ) {
# load the genus name from the parent_taxid
if (!$tax_file{$parent_taxid}) {
- die "No parent id found for species " . $node{ $id }{ 'name' } . " (id = $id) !! Check your input file !!\n" ;
+ die "No parent id found for species " . $node{ $id }{ 'name' } . " (id = '" . $id . "') !! Check your input file !!\n" ;
}
-
+
$node{ $id }{ 'genus' } = $node{ $parent_taxid }{ 'name' };
$node{ $id }{ 'species' } = $node{ $id }{ 'name' };
###message("FOUND SPECIES: " . $node{ $id }{ 'name' } . " genus = " . $node{ $id }{ 'genus' } . "\n" , 1);
-
+
} else {
- ###message("FOUND NODE NAME: " . $node{ $node{ $id }{ 'parent_taxid' }}{ 'name' } . "( genus = " . $node{ $id }{ 'level' } . " species = " . $node{ $id }{ 'name' } . "\n",1);
-
+ ###message("FOUND NODE NAME: " . $node{ $node{ $id }{ 'parent_taxid' }}{ 'name' } . "( genus = " . $node{ $id }{ 'level' } . " species = " . $node{ $id }{ 'name' } . "\n",1);
+
$node{ $id }{ 'genus' } = $node{ $id }{ 'level' };
$node{ $id }{ 'species' } = $node{ $id }{ 'name' };
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|