From: <sfr...@us...> - 2011-03-30 18:58:32
|
Revision: 780 http://treebase.svn.sourceforge.net/treebase/?rev=780&view=rev Author: sfrgpiel Date: 2011-03-30 18:58:25 +0000 (Wed, 30 Mar 2011) Log Message: ----------- Schema and scripts for creating a vToL. Added Paths: ----------- trunk/treebase-derivatives/vToL/db/vtol.zip trunk/treebase-derivatives/vToL/src/build_vtol.pl trunk/treebase-derivatives/vToL/src/classification_to_tree.pl trunk/treebase-derivatives/vToL/src/download_vtol.pl trunk/treebase-derivatives/vToL/src/query_vtol.pl trunk/treebase-derivatives/vToL/src/tree_to_classification.pl trunk/treebase-derivatives/vToL/src/upload_tree.pl Added: trunk/treebase-derivatives/vToL/db/vtol.zip =================================================================== (Binary files differ) Property changes on: trunk/treebase-derivatives/vToL/db/vtol.zip ___________________________________________________________________ Added: svn:mime-type + application/octet-stream Added: trunk/treebase-derivatives/vToL/src/build_vtol.pl =================================================================== --- trunk/treebase-derivatives/vToL/src/build_vtol.pl (rev 0) +++ trunk/treebase-derivatives/vToL/src/build_vtol.pl 2011-03-30 18:58:25 UTC (rev 780) @@ -0,0 +1,263 @@ +#!/usr/bin/perl + +# Script finds species present in the classification, yet missing +# from the phylogeny, and maps them to the most recent compatible node +# on the phylogeny. The most recent compatible node is the smallest clade +# on the phylogeny whose descendants intersect with the descendants of a +# clade on the classification but do not contain any that intersect with +# names on the classification that are outside of the clade. + +use strict; +use DBI; + +# check that the right number of arguments are listed +die "Input error! Usage: perl build_vtol.pl tree_id\n" if (@ARGV < 1); + +# Fill in the database name and access credentials +my $database = ""; +my $username = ""; +my $password = ""; +my $dbh = &ConnectToPg($database, $username, $password); + +my $tree_id = shift @ARGV; +my $new_nodes_table = "my_edited_classification"; + + +# STEP 1 +# find MRCA in the classification for all leaf nodes in tree_id x + +my $statement = "SELECT tax_id, name_txt, left_id, right_id +FROM $new_nodes_table JOIN ncbi_names USING (tax_id) +WHERE left_id <= ( SELECT MIN(left_id) FROM $new_nodes_table WHERE tax_id IN ( + SELECT tx.taxid + FROM nodes nds JOIN taxon_variants tv USING (taxon_variant_id) + JOIN taxa tx ON (tv.taxon_id = tx.taxon_id) + WHERE tx.taxid > 0 + AND nds.tree_id = $tree_id + AND (nds.right_id - nds.left_id = 1) + ) ) +AND right_id >= ( SELECT MAX(right_id) FROM $new_nodes_table WHERE tax_id IN ( + SELECT tx.taxid + FROM nodes nds JOIN taxon_variants tv USING (taxon_variant_id) + JOIN taxa tx ON (tv.taxon_id = tx.taxon_id) + WHERE tx.taxid > 0 + AND nds.tree_id = $tree_id + AND (nds.right_id - nds.left_id = 1) + ) ) +AND name_class = 'scientific name' +ORDER BY right_id +LIMIT 1"; + + +my ($mrca_ncbi_taxid, $mrca_ncbi_name, $mrca_ncbi_left_id, $mrca_ncbi_right_id) = $dbh->selectrow_array ($statement); + +print "\n\nTree tree_id = $tree_id\n\n"; +print "taxid of the MRCA in NCBI = $mrca_ncbi_taxid\n"; +print "name of the MRCA in NCBI = $mrca_ncbi_name\n\n"; + +if ($mrca_ncbi_taxid == 0) { + my $sc = $dbh->disconnect; + exit; +} + + +# STEP 2 +# list of classification names that potentially could be mapped +# to the tree with the given tree_id + +$statement = "SELECT nnds.tax_id +FROM $new_nodes_table nnds JOIN $new_nodes_table inc + ON (nnds.left_id BETWEEN inc.left_id AND inc.right_id) +WHERE inc.tax_id = ( + SELECT tax_id + FROM $new_nodes_table + WHERE left_id <= ( SELECT MIN(left_id) FROM $new_nodes_table WHERE tax_id IN ( + SELECT tx.taxid + FROM nodes nds JOIN taxon_variants tv USING (taxon_variant_id) + JOIN taxa tx ON (tv.taxon_id = tx.taxon_id) + WHERE tx.taxid > 0 + AND nds.tree_id = $tree_id + AND (nds.right_id - nds.left_id = 1) + ) ) + AND right_id >= ( SELECT MAX(right_id) FROM $new_nodes_table WHERE tax_id IN ( + SELECT tx.taxid + FROM nodes nds JOIN taxon_variants tv USING (taxon_variant_id) + JOIN taxa tx ON (tv.taxon_id = tx.taxon_id) + WHERE tx.taxid > 0 + AND nds.tree_id = $tree_id + AND (nds.right_id - nds.left_id = 1) + ) ) + ORDER BY right_id + LIMIT 1 +) +EXCEPT +SELECT tx.taxid +FROM nodes nds JOIN taxon_variants tv USING (taxon_variant_id) +JOIN taxa tx ON (tv.taxon_id = tx.taxon_id) +WHERE nds.tree_id = $tree_id +AND tx.taxid > 0 +AND (nds.right_id - nds.left_id = 1)"; + +my $sth = $dbh->prepare($statement) +or die "Can't prepare $statement: $dbh->errstr\n"; +my $rv = $sth->execute +or die "can't execute the query: $sth->errstr\n"; + +my @ncbi_list; +my %ncbi_hash; +while(my @row = $sth->fetchrow_array) { + push(@ncbi_list, $row[0]); + $ncbi_hash{ $row[0] } = $row[0]; +} +my $rd = $sth->finish; + +print "Number of classification names that potentially could be mapped to tree $tree_id: $#ncbi_list \n"; +print "i.e., they are all descendants from the MRCA of tree $tree_id (i.e. $mrca_ncbi_name), \n"; +print "excluding names that are already in tree $tree_id.\n\n"; + +# STEP 3 +# List all classification internal nodes bounded by the phylogeny. +# Order by smallest clade to largest + +print "List all ncbi internal nodes in the NCBI $mrca_ncbi_name \n"; +print "subtree order by smallest clade to largest: \n"; +print " taxid left_id right_id clade size\n"; + +$statement = "SELECT nnds.tax_id, nnds.left_id, nnds.right_id, (nnds.right_id - nnds.left_id) - 1 AS clade_size +FROM $new_nodes_table nnds +WHERE nnds.left_id >= $mrca_ncbi_left_id +AND nnds.right_id <= $mrca_ncbi_right_id +AND (nnds.right_id - nnds.left_id) > 1 +ORDER BY (nnds.right_id - nnds.left_id) "; + +my $sth = $dbh->prepare($statement) +or die "Can't prepare $statement: $dbh->errstr\n"; +my $rv = $sth->execute +or die "can't execute the query: $sth->errstr\n"; + +my @ncbi_internalnode_list; +my %ncbi_internalnode_left; +my %ncbi_internalnode_right; +while(my @row = $sth->fetchrow_array) { + printf (" %10d %10d %10d %10d\n", @row ); + push(@ncbi_internalnode_list, $row[0]); + $ncbi_internalnode_left{ $row[0] } = $row[1]; + $ncbi_internalnode_right{ $row[0] } = $row[2]; +} +my $rd = $sth->finish; +print "\n"; + +# STEP 4 +# For each classification clade, see if there is a MRCA equivalent in tree_id $tree_id + +print "For each ncbi clade, see if there is an equivalent clade in tree_id $tree_id:\n\n"; +print "ncbi_taxid -> tree node_id\n"; + +$statement = "SELECT onds.node_id +FROM nodes onds +WHERE onds.tree_id = ? +AND onds.left_id <= ( + SELECT MIN(nds.left_id) + FROM nodes nds + JOIN taxon_variants tv USING (taxon_variant_id) + JOIN taxa tx ON (tv.taxon_id = tx.taxon_id) + JOIN $new_nodes_table nnds ON (tx.taxid = nnds.tax_id) + WHERE nnds.left_id >= ? + AND nnds.left_id <= ? + AND nds.tree_id = ? +) +AND onds.right_id >= ( + SELECT MAX(nds.right_id) + FROM nodes nds + JOIN taxon_variants tv USING (taxon_variant_id) + JOIN taxa tx ON (tv.taxon_id = tx.taxon_id) + JOIN $new_nodes_table nnds ON (tx.taxid = nnds.tax_id) + WHERE nnds.left_id >= ? + AND nnds.left_id <= ? + AND nds.tree_id = ? +) +ORDER BY onds.right_id +LIMIT 1"; + +my $sth = $dbh->prepare($statement) +or die "Can't prepare $statement: $dbh->errstr\n"; + +my %mapped_node; +my $node_in_tree; +for ( my $j=0; $j < @ncbi_internalnode_list; $j++ ) { + my $rv = $sth->execute( $tree_id, $ncbi_internalnode_left{ $ncbi_internalnode_list[$j] }, $ncbi_internalnode_right{ $ncbi_internalnode_list[$j] }, $tree_id, $ncbi_internalnode_left{ $ncbi_internalnode_list[$j] }, $ncbi_internalnode_right{ $ncbi_internalnode_list[$j] }, $tree_id ); + ($node_in_tree) = $sth->fetchrow_array; + $mapped_node{ $ncbi_internalnode_list[$j] } = $node_in_tree if ($node_in_tree); + print " $ncbi_internalnode_list[$j] -> $node_in_tree\n" if ($node_in_tree); +} +my $rd = $sth->finish; +print "\n"; + + +# STEP 5 +# For each classification clade, going from smallest to largest, see if there are +# any descendants that can be mapped + +print "For each ncbi clade going from smallest to largest, take all descendants and \n"; +print "attach them to the equivalent clade in the tree:\n\n"; +print "ncbi_taxid -> tree node_id\n"; + +$statement = "SELECT nnds.tax_id, nna.name_txt +FROM ncbi_names nna JOIN $new_nodes_table nnds USING (tax_id) +JOIN $new_nodes_table ninc ON (nnds.left_id BETWEEN ninc.left_id AND ninc.right_id) +WHERE (nnds.right_id - nnds.left_id = 1) +AND nna.name_class = 'scientific name' +AND ninc.tax_id = ? "; +my $sth = $dbh->prepare($statement) +or die "Can't prepare $statement: $dbh->errstr\n"; + + +for ( my $j=0; $j < @ncbi_internalnode_list; $j++ ) { + if ( defined ( $mapped_node{ $ncbi_internalnode_list[$j] } ) ) { + my $rv = $sth->execute( $ncbi_internalnode_list[$j] ); + while(my @row = $sth->fetchrow_array) { + if ( defined($ncbi_hash{ $row[0] }) ) { + print "map $row[0] ($row[1]) to $mapped_node{ $ncbi_internalnode_list[$j] } \n"; + + my $pq_id; + + $pq_id = $dbh->selectrow_array("SELECT pq_id FROM pq WHERE tax_id = $row[0] "); + + if ( !($pq_id) ) { + $dbh->do( "INSERT INTO pq (taxon_name, tax_id) VALUES (?, ?) ", undef, $row[1], $row[0] ); + $pq_id = $dbh->last_insert_id(undef,undef,undef,undef,{sequence=>'pq_id_seq'}); + } + + $dbh->do( "DELETE FROM pq_subtree WHERE pq_id = ? AND tree_id = ? ", undef, $pq_id, $tree_id ); + $dbh->do( "INSERT INTO pq_subtree (pq_id, tree_id, node_id) VALUES (?, ?, ?) ", undef, $pq_id, $tree_id, $mapped_node{ $ncbi_internalnode_list[$j] } ); + + delete($ncbi_hash{ $row[0] }); + } + } + } +} + +print "\n"; + + + + +my $sc = $dbh->disconnect; +exit; + + + +# Connect to Postgres using DBI +#============================================================== +sub ConnectToPg { + + my ($cstr, $user, $pass) = @_; + + $cstr = "DBI:Pg:dbname="."$cstr"; + #$cstr .= ";host=dev.nescent.org"; + + my $dbh = DBI->connect($cstr, $user, $pass, {PrintError => 0, RaiseError => 1}); + $dbh || &error("DBI connect failed : ",$dbh->errstr); + + return($dbh); +} Added: trunk/treebase-derivatives/vToL/src/classification_to_tree.pl =================================================================== --- trunk/treebase-derivatives/vToL/src/classification_to_tree.pl (rev 0) +++ trunk/treebase-derivatives/vToL/src/classification_to_tree.pl 2011-03-30 18:58:25 UTC (rev 780) @@ -0,0 +1,118 @@ +#!/usr/bin/perl + +# Script exports a clade from the a classification (NCBI's by +# default) to a NEXUS style tree file. This is useful if you'd like to +# use Mesquite to improve the classification -- e.g. you can +# create unnamed ranks that are not present in NCBI, or can +# add in missing species. After editing the classification tree, you +# can re-import it into the database to use with your vToL + +use strict; +use DBI; + +my $rootid = 999999; +my $new_nodes_table = "my_edited_classification"; + +my $file = "ncbi_tree_out.tre"; +open (OUTPUT, ">$file") || die "Cannot open $file!: $!"; + +print OUTPUT "#NEXUS\n\nBEGIN TREES;\n\n"; + +# Fill in the database name and access credentials +my $database = ""; +my $username = ""; +my $password = ""; +my $dbh = &ConnectToPg($database, $username, $password); + +my $count = "SELECT COUNT(*) FROM ncbi_names NATURAL INNER JOIN $new_nodes_table "; +$count .= "WHERE parent_tax_id = ? AND name_class = 'scientific name'"; +#$count .= "WHERE parent_tax_id = ? "; + + +my $select = "SELECT tax_id, name_txt FROM ncbi_names NATURAL INNER JOIN $new_nodes_table "; +$select .= "WHERE parent_tax_id = ? AND name_class = 'scientific name'"; +#$select .= "WHERE parent_tax_id = ? "; + +my $children = $dbh->prepare($select); + +print OUTPUT "\tTREE mytree = "; + +&walktree($dbh, $rootid); + +print OUTPUT ";\n"; + +my $rc = $dbh->disconnect; + +print OUTPUT "\nEND;\n"; +exit; + + +# walktree +#============================================================== +sub walktree { + my $dbh = shift; + my $parent_id = shift; + my $parent_name = shift; + + + my $totRec = $dbh->selectrow_array ($count, undef, $parent_id); + + if ($totRec > 0) { + # still more children -- print a new open parenthesis + print OUTPUT "("; + } else { + # no more children -- print the OTU + print OUTPUT tokenize("$parent_name"); + } + + # get some children + $children->execute($parent_id); + + my $br = 0; + for my $row (@{$children->fetchall_arrayref}) { + $br++; + my ($child_id, $child_name) = @$row; + + #treat each child as a parent and walk the tree some more + walktree($dbh, $child_id, $child_name); + print OUTPUT "," if ($br < $totRec); + } + + if ($totRec > 0) { + print OUTPUT ")"; + print OUTPUT tokenize("$parent_name"); + } +} + +# tokenize +#============================================================== +sub tokenize { + + my $token = shift; + + $token =~ s/\'/\"/g; + + if ($token =~ m/[-\/\?\<\>\*\%\&\$\#\@\!\"\(\)]/) { + $token = "\'$token\'"; + } else { + $token =~ s/\s/_/g; + } + + return ($token); + +} + +# Connect to Postgres using DBI +#============================================================== +sub ConnectToPg { + + my ($cstr, $user, $pass) = @_; + + $cstr = "DBI:Pg:dbname="."$cstr"; + #$cstr .= ";host=dev.nescent.org"; + + my $dbh = DBI->connect($cstr, $user, $pass, {PrintError => 1, RaiseError => 1}); + $dbh || &error("DBI connect failed : ",$dbh->errstr); + + return($dbh); +} Added: trunk/treebase-derivatives/vToL/src/download_vtol.pl =================================================================== --- trunk/treebase-derivatives/vToL/src/download_vtol.pl (rev 0) +++ trunk/treebase-derivatives/vToL/src/download_vtol.pl 2011-03-30 18:58:25 UTC (rev 780) @@ -0,0 +1,176 @@ +#!/usr/bin/perl + +# This script outputs a vToL as a NEXUS tree file, attaching all +# missing species (from the classification) to an extra node that +# is parent to the subtree where the missing species attaches. +# Additionally, the added species are prefixed with "O" so that +# they can be easily highlighted in a different color using PhyloWidget + +use strict; +use DBI; + +my @inputs = @ARGV; +my $map = shift(@inputs); + +if (length($inputs[0]) < 2) { + print "Input error! Usage: perl download_vToL.pl [mapping] [list of tree_ids] \n"; + exit; +} + + +# Fill in the database name and access credentials +my $database = ""; +my $username = ""; +my $password = ""; +my $dbh = &ConnectToPg($database, $username, $password); + +&getTrees ($dbh, $map, @inputs); + +my $rc = $dbh->disconnect; + +exit; + + + + +# get trees +#============================================================== +sub getTrees { + + my ($dbh, $map, @treeItems) = @_; + + if (@treeItems) { + + print "#NEXUS\n\nBEGIN TREES;\n\n"; + foreach my $treeid (@treeItems) { + + print " [tr_id: $treeid]\n"; + + my $statement = "SELECT root, tree_label FROM trees WHERE ( tree_id = ? )"; + my $sth = $dbh->prepare ($statement); + $sth->execute( $treeid ); + + $statement = "SELECT child_id, node_label, edge_length, edge_support "; + $statement .= "FROM edges INNER JOIN nodes ON edges.child_ID = nodes.node_id "; + $statement .= "WHERE parent_id = ?"; + my $children = $dbh->prepare ($statement); + + # return a list of mapped children + $statement = "SELECT taxon_name, tax_id FROM pq_subtree JOIN pq USING (pq_id) WHERE node_id = ? "; + my $mapping = $dbh->prepare ($statement); + + while (my @row = $sth->fetchrow_array()) { + + print "\tTREE ".&tokenize($row[1])." = "; + + &walktree($dbh, $map, $children, $mapping, $row[0]); + + print ";\n"; + + } + $sth->finish(); + } + print "\nEND;\n"; + + + } else { + print "Error: No trees requested\n"; + exit; + } + +} + +# walktree +#============================================================== +sub walktree { + my $dbh = shift; + my $map = shift; + my $children = shift; + my $mapping = shift; + my $id = shift; + my $support = shift; + my $length = shift; + + my $statement = "SELECT COUNT(*) FROM edges WHERE parent_id = $id"; + my $totRec = $dbh->selectrow_array ($statement); + + $statement = "SELECT COUNT(*) FROM pq_subtree WHERE node_id = $id"; + my $totMaps = $dbh->selectrow_array ($statement); + + if ($totRec) { + print "("; + } + if (($totMaps) && ($map)) { + print "("; + } + + + $children->execute($id); + + my $br = 0; + for my $row (@{$children->fetchall_arrayref}) { + $br++; + my ($id, $label, $edge_length, $edge_support) = @$row; + $label = "O $label" if (($map) && ($label)); + print &tokenize($label); + walktree($dbh, $map, $children, $mapping, $id, $edge_support, $edge_length); + print "," if ($br < $totRec); + } + + if ($totRec) { + + print ")"; + print &tokenize($support) if ($support); + print ":". &tokenize($length) if ($length); + } else { + print ":". &tokenize($length) if ($length); + } + + if (($totMaps) && ($map)) { + $mapping->execute($id); + for my $row (@{$mapping->fetchall_arrayref}) { + my ($mapped_label, $tax_id) = @$row; + $mapped_label = "$mapped_label"; + print ",". &tokenize($mapped_label); + } + print ")"; + } + + +} + +# tokenize -- encapsulate tokens according to nexus rules +# technically speaking, single quotes should be repeated +# e.g. change "It's nice" to "'It''s nice'", but I'd rather not +# mess with that, so I'm changing all single quotes to double +#============================================================== +sub tokenize { + + my $token = shift; + + $token =~ s/\'/\"/g; + + if ($token =~ m/[-\/\?\<\>\*\%\&\$\#\@\!\"\:]/) { + $token = "\'$token\'"; + } else { + $token =~ s/\s/_/g; + } + + return ($token); + +} + +# Connect to Postgres using DBI +#============================================================== +sub ConnectToPg { + + my ($cstr, $user, $pass) = @_; + + $cstr = "DBI:Pg:dbname="."$cstr"; + #$cstr .= ";host=dev.nescent.org"; + + my $dbh = DBI->connect($cstr, $user, $pass, {PrintError => 1, RaiseError => 1}); + $dbh || &error("DBI connect failed : ",$dbh->errstr); + + return($dbh); +} \ No newline at end of file Added: trunk/treebase-derivatives/vToL/src/query_vtol.pl =================================================================== --- trunk/treebase-derivatives/vToL/src/query_vtol.pl (rev 0) +++ trunk/treebase-derivatives/vToL/src/query_vtol.pl 2011-03-30 18:58:25 UTC (rev 780) @@ -0,0 +1,143 @@ +#!/usr/bin/perl + +# Searches for the MRCA on a phylogeny based on a list of taxon names. +# Returns all the descendants of the MRCA, including taxa that were +# missing from the phylogeny but that descend from a classification rank that +# maps to the subtree with the MRCA at its origin. Additionally, it returns +# taxa from the classification that map to the direct ancestor of the MRCA + +use strict; +use DBI; + +# check that the right number of arguments are listed +die "Input error! Usage: perl query_vtol.pl tree_id <list of taxa> +e.g.: perl query_taxa.pl 9365 'Dipelta floribunda' 'Zabelia biflora' \n" if (@ARGV < 2); + +my $tree_id = shift; +my @taxonlabels = @ARGV; + +my $list_size = $#taxonlabels; +$list_size++; + +my $list_string = join ("', '", @taxonlabels ); +$list_string = "'$list_string'"; + +# Fill in the database name and access credentials +my $database = ""; +my $username = ""; +my $password = ""; +my $dbh = &ConnectToPg($database, $username, $password); + +print "\nFind all leaf names that descend from the MRCA of ($list_string) in tree $tree_id:\n\n"; + +my $statement = "SELECT nds.node_id, nds.node_label +FROM nodes nds JOIN nodes inc ON (nds.left_id BETWEEN inc.left_id AND inc.right_id) +WHERE nds.tree_id = $tree_id +AND inc.tree_id = nds.tree_id +AND (nds.right_id - nds.left_id = 1) +AND inc.node_id = ( + SELECT ndsP.node_id + FROM nodes ndsC JOIN node_path npth ON (ndsC.node_id = npth.child_node_id) + JOIN nodes ndsP ON (npth.parent_node_id = ndsP.node_id) + JOIN taxon_variants tv ON (ndsC.taxon_variant_id = tv.taxon_variant_id) + JOIN taxa tx ON (tv.taxon_id = tx.taxon_id) + WHERE tx.namestring IN ($list_string) + AND ndsC.tree_id = $tree_id + GROUP BY ndsP.node_id, ndsP.right_id + HAVING COUNT(npth.child_node_id) >= $list_size + ORDER BY ndsP.right_id + LIMIT 1 +)"; + +my $query = $dbh->prepare ($statement); +$query->execute; + +for my $row (@{$query->fetchall_arrayref}) { + my ($node_id, $label) = @$row; + printf ( "%45s\n", $label); +} +$query->finish; + +print "\nFind all mapped names that descend from the MRCA of ($list_string) in tree $tree_id:\n\n"; + +$statement = "SELECT nds.node_id, p.taxon_name +FROM nodes nds JOIN nodes inc ON (nds.left_id BETWEEN inc.left_id AND inc.right_id) +JOIN pq_subtree pqs ON (nds.node_id = pqs.node_id) +JOIN pq p USING (pq_id) +WHERE nds.tree_id = $tree_id +AND inc.tree_id = nds.tree_id +AND inc.node_id = ( + SELECT ndsP.node_id + FROM nodes ndsC JOIN node_path npth ON (ndsC.node_id = npth.child_node_id) + JOIN nodes ndsP ON (npth.parent_node_id = ndsP.node_id) + JOIN taxon_variants tv ON (ndsC.taxon_variant_id = tv.taxon_variant_id) + JOIN taxa tx ON (tv.taxon_id = tx.taxon_id) + WHERE tx.namestring IN ($list_string) + AND ndsC.tree_id = $tree_id + GROUP BY ndsP.node_id, ndsP.right_id + HAVING COUNT(npth.child_node_id) >= $list_size + ORDER BY ndsP.right_id + LIMIT 1 +)"; + +my $query = $dbh->prepare ($statement); +$query->execute; + +for my $row (@{$query->fetchall_arrayref}) { + my ($node_id, $label) = @$row; + printf ( "%45s\n", $label); +} +$query->finish; + +print "\nFind all mapped names that *might* descend from the MRCA of ($list_string) in tree $tree_id:\n\n"; + +$statement = "SELECT nds.node_id, p.taxon_name, (100.0 * (ndsize.right_id - ndsize.left_id) / (nds.right_id - nds.left_id) ) as clade_span +FROM nodes nds JOIN pq_subtree pqs ON (nds.node_id = pqs.node_id) +JOIN pq p USING (pq_id), nodes ndsize +WHERE nds.tree_id = $tree_id +AND nds.left_id < ndsize.left_id +AND nds.right_id > ndsize.right_id +AND ndsize.node_id = ( + SELECT ndsP.node_id + FROM nodes ndsC JOIN node_path npth ON (ndsC.node_id = npth.child_node_id) + JOIN nodes ndsP ON (npth.parent_node_id = ndsP.node_id) + JOIN taxon_variants tv ON (ndsC.taxon_variant_id = tv.taxon_variant_id) + JOIN taxa tx ON (tv.taxon_id = tx.taxon_id) + WHERE tx.namestring IN ($list_string) + AND ndsC.tree_id = $tree_id + GROUP BY ndsP.node_id, ndsP.right_id + HAVING COUNT(npth.child_node_id) >= $list_size + ORDER BY ndsP.right_id + LIMIT 1 +) +ORDER BY (nds.right_id - nds.left_id) DESC;"; + +my $query = $dbh->prepare ($statement); +$query->execute; + +for my $row (@{$query->fetchall_arrayref}) { + my ($node_id, $label, $clade_span) = @$row; + printf ( "%45s %5.1f\%\n", $label, $clade_span); +} +$query->finish; + + +my $rc = $dbh->disconnect; + + + + +# Connect to Postgres using DBI +#============================================================== +sub ConnectToPg { + + my ($cstr, $user, $pass) = @_; + + $cstr = "DBI:Pg:dbname="."$cstr"; + # $cstr .= ";host=10.9.1.1"; + + my $dbh = DBI->connect($cstr, $user, $pass, {PrintError => 1, RaiseError => 1}); + $dbh || &error("DBI connect failed : ",$dbh->errstr); + + return($dbh); +} \ No newline at end of file Added: trunk/treebase-derivatives/vToL/src/tree_to_classification.pl =================================================================== --- trunk/treebase-derivatives/vToL/src/tree_to_classification.pl (rev 0) +++ trunk/treebase-derivatives/vToL/src/tree_to_classification.pl 2011-03-30 18:58:25 UTC (rev 780) @@ -0,0 +1,272 @@ +#!/usr/bin/perl + + +# This script takes a tree from a NEXUS file and creates a +# table to store the tree as a classification, whereupon +# you can build a vToL. So, for example, if you're unhappy with +# NCBI's classification for a certain group, using the "classification_to_tree.pl" +# script to dump the NCBI clade to a tree file, then use Mesquite +# to edit the classification tree, then use this script to re-import it. +# A new table is created to accomodate your edited classification +# so as not to disturb the original NCBI classification. + +use strict; +use warnings; +use DBI; +use Bio::Phylo::IO 'parse'; + +my ( $taxa, $matrix, $forest ); + +my $file = shift @ARGV; +my $blocks = parse( + '-format' => 'nexus', + '-file' => $file, +); + +for my $block ( @{ $blocks } ) { + $forest = $block if $block->isa('Bio::Phylo::Forest'); +} + +# Fill in the database name and access credentials +my $database = ""; +my $username = ""; +my $password = ""; +my $dbh = &ConnectToPg($database, $username, $password); + +my ($sth, $rv, $statement); + +my $new_nodes_table = "my_edited_classification"; +my $new_nodes_seq = "my_edited_classification_seq_taxid"; +my $highest_ncbi = 1000000; +my $start_taxid = 4199; #ncbi taxid for the Dipsacales +my $division_id = 4; +my $inherited_div_flag = 1; +my $genetic_code_id = 1; +my $inherited_gc_flag = 1; +my $mitochondrial_genetic_code_id = 1; +my $inherited_mgc_flag = 1; +my $genbank_hidden_flag = 0; +my $hidden_subtree_root_flag = 0; + +# highest tax_id = 869615 +$dbh->do( "DELETE FROM NCBI_NAMES WHERE TAX_ID > 869615" ); + +$dbh->do( "DROP TABLE IF EXISTS $new_nodes_table CASCADE" ); +$dbh->do( "DROP SEQUENCE IF EXISTS $new_nodes_seq " ); + +$statement = <<STATEMENT; +CREATE SEQUENCE $new_nodes_seq + START WITH $highest_ncbi + INCREMENT BY 1 + NO MAXVALUE + NO MINVALUE + CACHE 1 +STATEMENT +$dbh->do( "$statement" ); + +$statement = <<STATEMENT; +CREATE TABLE $new_nodes_table ( + tax_id integer DEFAULT nextval('$new_nodes_seq'::regclass) NOT NULL, + parent_tax_id integer NOT NULL, + rank character varying(32), + embl_code character varying(16), + division_id integer NOT NULL, + inherited_div_flag integer NOT NULL, + genetic_code_id integer NOT NULL, + inherited_gc_flag integer NOT NULL, + mitochondrial_genetic_code_id integer NOT NULL, + inherited_mgc_flag integer NOT NULL, + genbank_hidden_flag integer NOT NULL, + hidden_subtree_root_flag integer NOT NULL, + comments character varying(255) DEFAULT NULL::character varying, + left_id integer, + right_id integer +) +STATEMENT + +$dbh->do( "$statement" ); +$statement = "ALTER TABLE ONLY $new_nodes_table ADD CONSTRAINT $new_nodes_table" . "_pkey PRIMARY KEY (tax_id)"; +$dbh->do( "$statement" ); +$statement = "CREATE INDEX $new_nodes_table" . "_left_id ON $new_nodes_table USING btree (left_id)"; +$dbh->do( "$statement" ); +$statement = "CREATE INDEX $new_nodes_table" . "_right_id ON $new_nodes_table USING btree (right_id)"; +$dbh->do( "$statement" ); +$statement = "CREATE INDEX $new_nodes_table" . "_parent_tax_id ON $new_nodes_table USING btree (parent_tax_id)"; +$dbh->do( "$statement" ); + +$dbh->do( "DROP TABLE IF EXISTS $new_nodes_table"."_path CASCADE" ); +$statement = "CREATE TABLE $new_nodes_table"."_path (\n"; +$statement .= <<STATEMENT; + child_node_id bigint DEFAULT (0)::bigint NOT NULL, + parent_node_id bigint DEFAULT (0)::bigint NOT NULL, + distance integer DEFAULT 0 NOT NULL +) +STATEMENT +$dbh->do( "$statement" ); +$statement = "CREATE INDEX $new_nodes_table" . "_path_parent_id ON $new_nodes_table"."_path USING btree (parent_node_id)"; +$dbh->do( "$statement" ); +$statement = "CREATE INDEX $new_nodes_table" . "_path_child_id ON $new_nodes_table"."_path USING btree (child_node_id)"; +$dbh->do( "$statement" ); +$statement = "CREATE INDEX $new_nodes_table" . "_path_distance ON $new_nodes_table"."_path USING btree (distance)"; +$dbh->do( "$statement" ); + +# global statements for setting the left-right indexing +my $setleft = $dbh->prepare("UPDATE $new_nodes_table SET left_id = ? WHERE tax_id = ?"); +my $setright = $dbh->prepare("UPDATE $new_nodes_table SET right_id = ? WHERE tax_id = ?"); +my $ctr; + +# global statements for calculating the transitive closure +my $deletepaths = $dbh->prepare("DELETE FROM $new_nodes_table"."_path"); + +my $init_sql = "INSERT INTO $new_nodes_table"."_path (child_node_id, parent_node_id, distance) "; + $init_sql .= "SELECT tax_id, parent_tax_id, 1 FROM $new_nodes_table "; +my $initialize_paths = $dbh->prepare("$init_sql"); + +my $path_sql = "INSERT INTO $new_nodes_table"."_path (child_node_id, parent_node_id, distance) "; + $path_sql .= "SELECT n.tax_id, p.parent_node_id, p.distance+1 "; + $path_sql .= "FROM $new_nodes_table"."_path p, $new_nodes_table n "; + $path_sql .= "WHERE p.child_node_id = n.parent_tax_id "; + $path_sql .= "AND p.distance = ?"; +my $calc_paths = $dbh->prepare("$path_sql"); + +# get only the first tree +my $tree = shift ( @{ $forest->get_entities } ); + +print "###########################\n"; +print $tree->get_name . "\n"; +my $tree_name = $tree->get_name; + +# create a new node record, which will be the root of this tree +my $tree_id = 1; +my $root_node_id = $highest_ncbi - 1; +$statement = "INSERT INTO $new_nodes_table "; +$statement .= "(tax_id, parent_tax_id, division_id, inherited_div_flag, genetic_code_id, inherited_gc_flag, mitochondrial_genetic_code_id, inherited_mgc_flag, genbank_hidden_flag, hidden_subtree_root_flag) VALUES "; +$statement .= "($root_node_id, 0, $division_id, $inherited_div_flag, $genetic_code_id, $inherited_gc_flag, $mitochondrial_genetic_code_id, $inherited_mgc_flag, $genbank_hidden_flag, $hidden_subtree_root_flag)"; +$dbh->do( "$statement" ); + +# set a counter for the left-right indexing +$ctr = ($highest_ncbi * 100); +walktree( $tree->get_root , $root_node_id); + +compute_tc(); + +my $rc = $dbh->disconnect; + +exit; + + +#=================================== +sub walktree { + my $parent = shift; + my $parent_id = shift; + + $setleft->execute($ctr++, $parent_id); + + for my $child ( @{ $parent->get_children } ) { + + my $branch_length; + my $edge_support; + my $child_id; + + if (($child->get_name) && !($child->get_name =~ m/^\d*\.?\d+$/)) { + my $taxon_label = &detokenize( $child->get_name ); + + my $statement = "SELECT COUNT(*) FROM ncbi_names WHERE name_txt = " . $dbh->quote( $taxon_label ); + my $totRec = $dbh->selectrow_array ($statement); + + if ($totRec > 0) { + # there is a node name and this name already exists + $statement = "SELECT tax_id FROM ncbi_names WHERE name_txt = " . $dbh->quote( $taxon_label ) . " LIMIT 1 "; + $child_id = $dbh->selectrow_array ($statement); + + $statement = "INSERT INTO $new_nodes_table "; + $statement .= "(tax_id, parent_tax_id, division_id, inherited_div_flag, genetic_code_id, inherited_gc_flag, "; + $statement .= "mitochondrial_genetic_code_id, inherited_mgc_flag, genbank_hidden_flag, hidden_subtree_root_flag) "; + $statement .= "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) "; + $dbh->do( "$statement", undef, $child_id, $parent_id, $division_id, $inherited_div_flag, $genetic_code_id, $inherited_gc_flag, $mitochondrial_genetic_code_id, $inherited_mgc_flag, $genbank_hidden_flag, $hidden_subtree_root_flag ); + + } else { + # there is a node name, but it doesn't exist + $statement = "INSERT INTO $new_nodes_table "; + $statement .= "(parent_tax_id, division_id, inherited_div_flag, genetic_code_id, inherited_gc_flag, "; + $statement .= "mitochondrial_genetic_code_id, inherited_mgc_flag, genbank_hidden_flag, hidden_subtree_root_flag) "; + $statement .= "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) "; + $dbh->do( "$statement", undef, $parent_id, $division_id, $inherited_div_flag, $genetic_code_id, $inherited_gc_flag, $mitochondrial_genetic_code_id, $inherited_mgc_flag, $genbank_hidden_flag, $hidden_subtree_root_flag ); + + $child_id = $dbh->last_insert_id(undef,undef,undef,undef,{sequence=>"$new_nodes_seq"}); + $dbh->do( "INSERT INTO ncbi_names (tax_id, name_txt, name_class) VALUES (?, ?, ?) ", undef, $child_id, $taxon_label, 'scientific name' ); + } + + } else { + # there is no node name for this node + $statement = "INSERT INTO $new_nodes_table "; + $statement .= "(parent_tax_id, division_id, inherited_div_flag, genetic_code_id, inherited_gc_flag, "; + $statement .= "mitochondrial_genetic_code_id, inherited_mgc_flag, genbank_hidden_flag, hidden_subtree_root_flag) "; + $statement .= "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) "; + $dbh->do( "$statement", undef, $parent_id, $division_id, $inherited_div_flag, $genetic_code_id, $inherited_gc_flag, $mitochondrial_genetic_code_id, $inherited_mgc_flag, $genbank_hidden_flag, $hidden_subtree_root_flag ); + + $child_id = $dbh->last_insert_id(undef,undef,undef,undef,{sequence=>"$new_nodes_seq"}); + $dbh->do( "INSERT INTO ncbi_names (tax_id, name_txt, name_class) VALUES (?, ?, ?) ", undef, $child_id, "Unnamed Rank $child_id", 'scientific name' ); + } + + walktree( $child, $child_id ); + } + + $setright->execute($ctr++, $parent_id); +} + +# Remove nexus tokenization +#============================================================== +sub detokenize { + my $token = shift; + + $token =~ s/_/ /g; + $token =~ s/^\'//g; + $token =~ s/\'$//g; + $token =~ s/''/'/g; + + return($token); +} + +# Compute the transitive closure +#============================================================== +sub compute_tc { + + $deletepaths->execute(); + $initialize_paths->execute(); + + my $dist = 1; + my $rv = 1; + while ($rv > 0) { + $rv = $calc_paths->execute($dist); + $dist++; + } +} + +# Connect to Postgres using DBI +#============================================================== +sub ConnectToPg { + + my ($cstr, $user, $pass) = @_; + + $cstr = "DBI:Pg:dbname="."$cstr"; + #$cstr .= ";host=10.9.1.1"; + + my $dbh = DBI->connect($cstr, $user, $pass, {PrintError => 1, RaiseError => 1}); + $dbh || &error("DBI connect failed : ",$dbh->errstr); + + return($dbh); +} + +# Get the auto-added id +#============================================================== +sub last_insert_id { + my($dbh, $sequence_name) = @_; + + my $driver = $dbh->{Driver}->{Name}; + if (lc($driver) eq 'mysql') { + return $dbh->{'mysql_insertid'}; + } else { + return $dbh->last_insert_id(undef,undef,undef,undef,{sequence=>'$sequence_name'}); + } +} Added: trunk/treebase-derivatives/vToL/src/upload_tree.pl =================================================================== --- trunk/treebase-derivatives/vToL/src/upload_tree.pl (rev 0) +++ trunk/treebase-derivatives/vToL/src/upload_tree.pl 2011-03-30 18:58:25 UTC (rev 780) @@ -0,0 +1,189 @@ +#!/usr/bin/perl + +# This script parsers a NEXUS file and uploads any phylogenies in it +# to the database. When this is done, you'll want to make a mapping +# between the new taxon labels brought in with your tree and the existing +# taxonomy: +# +# UPDATE nodes SET taxon_variant_id = tv.taxon_variant_id +# FROM nodes nds JOIN taxon_variants tv ON (nds.node_label = tv.fullnamestring) +# WHERE nds.tree_id = 9365 +# AND (nds.right_id - nds.left_id = 1) +# AND nodes.node_id = nds.node_id; + +use strict; +use warnings; +use DBI; +use Bio::Phylo::IO 'parse'; + +my ( $taxa, $matrix, $forest ); + +my $file = shift @ARGV; +my $blocks = parse( + '-format' => 'nexus', + '-file' => $file, +); + +for my $block ( @{ $blocks } ) { + $forest = $block if $block->isa('Bio::Phylo::Forest'); +} + +# Fill in the database name and access credentials +my $database = ""; +my $username = ""; +my $password = ""; +my $dbh = &ConnectToPg($database, $username, $password); + +my ($sth, $rv); + +# global statements for setting the left-right indexing +my $setleft = $dbh->prepare("UPDATE nodes SET left_id = ? WHERE node_id = ?"); +my $setright = $dbh->prepare("UPDATE nodes SET right_id = ? WHERE node_id = ?"); +my $ctr; + +# global statements for calculating the transitive closure +my $deletepaths = $dbh->prepare("DELETE FROM node_path WHERE child_node_id IN (SELECT node_id FROM nodes WHERE tree_id = ?)"); + +my $init_sql = "INSERT INTO node_path (child_node_id, parent_node_id, distance) "; + $init_sql .= "SELECT e.child_id, e.parent_id, 1 FROM edges e, nodes n "; + $init_sql .= "WHERE e.child_id = n.node_id AND n.tree_id = ?"; +my $initialize_paths = $dbh->prepare("$init_sql"); + +my $path_sql = "INSERT INTO node_path (child_node_id, parent_node_id, distance)"; + $path_sql .= "SELECT e.child_id, p.parent_node_id, p.distance+1 "; + $path_sql .= "FROM node_path p, edges e, nodes n "; + $path_sql .= "WHERE p.child_node_id = e.parent_id "; + $path_sql .= "AND n.node_id = e.child_id AND n.tree_id = ? "; + $path_sql .= "AND p.distance = ?"; +my $calc_paths = $dbh->prepare("$path_sql"); + +foreach my $tree ( @{ $forest->get_entities } ) { + print "###########################\n"; + print $tree->get_name . "\n"; + my $tree_name = $tree->get_name; + + # create a new node record, which will be the root of this tree + $dbh->do( "INSERT INTO nodes (node_label) VALUES (NULL) " ); + my $root_node_id = $dbh->last_insert_id(undef,undef,undef,undef,{sequence=>'nodes_node_id'}); + + # create a new tree record, specifying the tree's name it its root node + $dbh->do( "INSERT INTO trees (tree_label, root) VALUES ('$tree_name','$root_node_id')" ); + my $tree_id = $dbh->last_insert_id(undef,undef,undef,undef,{sequence=>'trees_tree_id'}); + + # update the newly created node so that it knows what tree it belongs to + $dbh->do( "UPDATE nodes SET tree_id = '$tree_id' WHERE node_id = $root_node_id " ); + + # set a counter for the left-right indexing + $ctr = 1; + walktree( $tree->get_root , $tree_id, $root_node_id); + + compute_tc($tree_id); +} +my $rc = $dbh->disconnect; + +exit; + + +#=================================== +sub walktree { + my $parent = shift; + my $tree_id = shift; + my $parent_id = shift; + + $setleft->execute($ctr++, $parent_id); + + for my $child ( @{ $parent->get_children } ) { + + my $branch_length; + my $edge_support; + + # create a new child record, but only use the label if it doesn't look like a + # clade support value (i.e. a number) + if (($child->get_name) && !($child->get_name =~ m/^\d*\.?\d+$/)) { + my $taxon_label = $child->get_name; + $dbh->do( "INSERT INTO nodes (node_label, tree_id) VALUES (?, ?) ", undef, &detokenize($taxon_label), $tree_id ); + } else { + $dbh->do( "INSERT INTO nodes (node_label, tree_id) VALUES (NULL, $tree_id) " ); + } + my $child_id = $dbh->last_insert_id(undef,undef,undef,undef,{sequence=>'nodes_node_id'}); + + # capture the branch length if there is one + if ($child->get_branch_length) { + $branch_length = $child->get_branch_length; + } else { + $branch_length = undef; + } + + # capture the edge support if internal node label looks like a number + if (($child->is_internal) && ($child->get_internal_name =~ m/^\d*\.?\d+$/)) { + $edge_support = $child->get_internal_name; + } else { + $edge_support = undef; + } + + # create an edge record between parent and child + my @values = ("$parent_id", "$child_id", "$branch_length", "$edge_support"); + $dbh->do( "INSERT INTO edges (parent_id, child_id, edge_length, edge_support) VALUES (?, ?, ?, ?) ", undef, @values); + + walktree( $child, $tree_id, $child_id ); + } + + $setright->execute($ctr++, $parent_id); +} + +# Remove nexus tokenization +#============================================================== +sub detokenize { + my $token = shift; + + $token =~ s/_/ /g; + $token =~ s/^\'//g; + $token =~ s/\'$//g; + $token =~ s/''/'/g; + + return($token); +} + +# Compute the transitive closure +#============================================================== +sub compute_tc { + my $tree_id = shift; + + $deletepaths->execute($tree_id); + $initialize_paths->execute($tree_id); + + my $dist = 1; + my $rv = 1; + while ($rv > 0) { + $rv = $calc_paths->execute($tree_id, $dist); + $dist++; + } +} + +# Connect to Postgres using DBI +#============================================================== +sub ConnectToPg { + + my ($cstr, $user, $pass) = @_; + + $cstr = "DBI:Pg:dbname="."$cstr"; + #$cstr .= ";host=10.9.1.1"; + + my $dbh = DBI->connect($cstr, $user, $pass, {PrintError => 1, RaiseError => 1}); + $dbh || &error("DBI connect failed : ",$dbh->errstr); + + return($dbh); +} + +# Get the auto-added id +#============================================================== +sub last_insert_id { + my($dbh, $sequence_name) = @_; + + my $driver = $dbh->{Driver}->{Name}; + if (lc($driver) eq 'mysql') { + return $dbh->{'mysql_insertid'}; + } else { + return $dbh->last_insert_id(undef,undef,undef,undef,{sequence=>'$sequence_name'}); + } +} \ No newline at end of file This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |