[Gmod-schema-cmts] schema/chado/bin ucsc_genes2gff.pl,1.5,1.6

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 454-5900

Update of /cvsroot/gmod/schema/chado/bin
In directory sc8-pr-cvs1:/tmp/cvs-serv22846/bin

Modified Files:
	ucsc_genes2gff.pl 
Log Message:
overhaul to simplify the script.  i'm running into some strangeness though
where multiple GB identifiers map to a single protein sequence and mrna GB
id.  this doesn't happen for mrna GB identifiers and associated sequence...
brian, any idea what's going on here?

added pfam, affy u133, and affy u95 parsing

Index: ucsc_genes2gff.pl
===================================================================
RCS file: /cvsroot/gmod/schema/chado/bin/ucsc_genes2gff.pl,v
retrieving revision 1.5
retrieving revision 1.6
diff -C2 -d -r1.5 -r1.6
*** ucsc_genes2gff.pl	19 Dec 2003 06:49:39 -0000	1.5
--- ucsc_genes2gff.pl	19 Dec 2003 08:11:24 -0000	1.6
***************
*** 8,11 ****
--- 8,12 ----
  use URI::Escape;
  use Text::Wrap;
+ $Text::Wrap::columns = 79;
  use Bio::SeqIO;
  use Bio::SeqFeature::Generic;
***************
*** 44,60 ****
  my $KNOWNGENEMRNA  = $ANNOTATIONS.'/knownGeneMrna.txt';
  my $KNOWNLOCUSLINK = $ANNOTATIONS.'/knownToLocusLink.txt';
  my $GENBANK        = $ANNOTATIONS.'/genbank2accessions.txt';
  my $LOCACC         = $ANNOTATIONS.'/loc2acc';

! my $mrna2protein = parseGenbank($GENBANK);
! my $kgxref = parseKgXref($KGXREF);
! my $loc2acc = parseLocAcc($LOCACC); # the best way I've found so far to link Genbank mRNA accession to Genbank protein accession
! my $knowngenepep = parseKnownGenePep($KNOWNGENEPEP);
! my $knowngenemrna = parseKnownGeneMrna($KNOWNGENEMRNA);
! my $knownlocuslink = parseKnownLocusLink($KNOWNLOCUSLINK);
  # need to pull in the omim and other annotations too

  print "##gff-version 3\n";
! 
  open(KG,$KNOWNGENE) or die "couldn't open('$KNOWNGENE'): $!";
  while (<KG>) {
--- 45,70 ----
  my $KNOWNGENEMRNA  = $ANNOTATIONS.'/knownGeneMrna.txt';
  my $KNOWNLOCUSLINK = $ANNOTATIONS.'/knownToLocusLink.txt';
+ my $KNOWNPFAM      = $ANNOTATIONS.'/knownToPfam.txt';
+ my $KNOWNU133      = $ANNOTATIONS.'/knownToU133.txt';
+ my $KNOWNU95       = $ANNOTATIONS.'/knownToU95.txt';
  my $GENBANK        = $ANNOTATIONS.'/genbank2accessions.txt';
  my $LOCACC         = $ANNOTATIONS.'/loc2acc';

! my %xref;
! 
! parseGenbank(\%xref,$GENBANK);
! parseKnownGenePep(\%xref,$KNOWNGENEPEP);
! parseKnownGeneMrna(\%xref,$KNOWNGENEMRNA);
! parseKgXref(\%xref,$KGXREF);
! parseLocAcc(\%xref,$LOCACC); # the best way I've found so far
!                                                   # to link Genbank mRNA accession to
!                                                   # Genbank protein accession
! parseKnownLocusLink(\%xref,$KNOWNLOCUSLINK);
! parseKnownAffy(\%xref,$KNOWNU133,$KNOWNU95);
! parseKnownPfam(\%xref,$KNOWNPFAM);
  # need to pull in the omim and other annotations too

  print "##gff-version 3\n";
! if(0){
  open(KG,$KNOWNGENE) or die "couldn't open('$KNOWNGENE'): $!";
  while (<KG>) {
***************
*** 75,96 ****
    # print the transcript
    print join ("\t",$chrom,$SRCDB,'mRNA',$txStart,$txEnd,'.',$strand,'.',"ID=$id;");
!   if(defined($kgxref->{$id})) {
!     foreach my $annotation_set (keys %{$kgxref->{$id}}) {
!       print "$annotation_set=". join (",", keys %{$kgxref->{$id}->{$annotation_set}}) .';';
! 		 
!     }
!     if(defined $knownlocuslink->{$id}) {
!       print "db:locus=", $knownlocuslink->{$id};
      }
- 
      print "\n";

!     # now write out stuff for protein
! 
!     my @protGenBank = keys (%{$kgxref->{$id}->{'db:genbank:protein'}});
!     my $protGenBank = $protGenBank[0];
! 
!     if(defined ($protGenBank)) {
!       print join ("\t",'.', $SRCDB,'protein','.','.','.','.',"ID=$protGenBank;Parent=$id"), "\n";
      }
    }
--- 85,100 ----
    # print the transcript
    print join ("\t",$chrom,$SRCDB,'mRNA',$txStart,$txEnd,'.',$strand,'.',"ID=$id;");
!   if(defined($xref{$id})) {
!     foreach my $annotation_set (map {($_ !~ /^sequence/) ? $_ : undef} keys %{$xref{$id}}) {
!       next unless $annotation_set;
!       print "$annotation_set=". join(",", keys %{$xref{$id}{$annotation_set}}) .';';
      }
      print "\n";

!     foreach my $annotation_set (map {($_ !~ /^sequence/ and $_ =~ /genbank:protein/) ? $_ : undef} keys %{$xref{$id}}){
!       next unless $annotation_set;
!       print join ("\t",'.', $SRCDB,'protein','.','.','.','.',
!                   "ID=". join(",", keys %{$xref{$id}{$annotation_set}}) .";Parent=$id"
!                  ), "\n";
      }
    }
***************
*** 209,229 ****
  }
  close(KG) or die "couldn't close('$KNOWNGENE'): $!";

! # for protein/mrna printing
! if(defined($kgxref) && (defined($knowngenepep) || defined($knowngenemrna))) {
!   print "##FASTA\n";
!   # now print all my lovely protein sequence
!   foreach my $protein (keys %{$knowngenepep}) {
! 	print ">".$protein."\n";
! 	$Text::Wrap::columns = 79;
! 	print wrap('', '', $knowngenepep->{$protein});
! 	print "\n";
    }
!   # now all the mRNA sequence
!   foreach my $mrna (keys %{$knowngenemrna}) {
! 	print ">".$mrna."\n";
! 	$Text::Wrap::columns = 79;
! 	print wrap('', '', $knowngenemrna->{$mrna});
! 	print "\n";
    }
  }
--- 213,232 ----
  }
  close(KG) or die "couldn't close('$KNOWNGENE'): $!";
+ }

! print "##FASTA\n";
! 
! foreach my $kg (keys %xref){
!   my $seq_mrna = $xref{$kg}{'sequence:mrna'};
!   my $seq_prot = $xref{$kg}{'sequence:protein'};
! 
!   if($seq_mrna){
!     print '>'. join(',',keys(%{$xref{$kg}{'db:genbank:mrna'}})) ."\n";
!     print wrap('','',$seq_mrna) ."\n";
    }
! 
!   if($seq_prot){
!     print '>'. join(',',keys(%{$xref{$kg}{'db:genbank:protein'}})) ."\n";
!     print wrap('','',$seq_prot) ."\n";
    }
  }
***************
*** 241,246 ****

  sub parseLocAcc {
!   my $filename = shift;
!   my $annotations = {};
    open  ANNFILE, $filename or die "Can't open file $filename: $!";
    while(<ANNFILE>) {
--- 244,248 ----

  sub parseLocAcc {
!   my($xref,$filename) = @_;
    open  ANNFILE, $filename or die "Can't open file $filename: $!";
    while(<ANNFILE>) {
***************
*** 252,259 ****
  	$line[4] =~ /(.*)\.\d/;
  	my $protein = $1;
! 	if($protein ne '-') { $annotations->{$gene} = $protein; }
    }
    close ANNFILE;
-   return ($annotations);
  }

--- 254,260 ----
  	$line[4] =~ /(.*)\.\d/;
  	my $protein = $1;
! 	if($protein ne '-') { $xref->{$gene}{'db:genbank:protein'}{$protein} = 1; }
    }
    close ANNFILE;
  }

***************
*** 271,276 ****

  sub parseKnownLocusLink {
!   my $filename = shift;
!   my $annotations = {};
    open  ANNFILE, $filename or die "Can't open file $filename: $!";
    while(<ANNFILE>) {
--- 272,276 ----

  sub parseKnownLocusLink {
!   my($xref,$filename) = @_;
    open  ANNFILE, $filename or die "Can't open file $filename: $!";
    while(<ANNFILE>) {
***************
*** 278,285 ****
      next if /^#/;
  	my ($accession,$locuslink) = split /\t/;
! 	$annotations->{$accession} = $locuslink;
    }
    close ANNFILE;
!   return ($annotations);
  }

--- 278,334 ----
      next if /^#/;
  	my ($accession,$locuslink) = split /\t/;
! 	$xref->{$accession}{'db:locuslink'}{$locuslink} = 1;
    }
    close ANNFILE;
! }
! 
! =head2 parseKnownPfam
! 
!  Title   : parseKnownPfam
!  Usage   :
!  Function:
!  Example :
!  Returns : 
!  Args    :
! 
! 
! =cut
! 
! sub parseKnownPfam {
!   my($xref,$filename) = @_;
!   open  ANNFILE, $filename or die "Can't open file $filename: $!";
!   while(<ANNFILE>) {
!     chomp;
!     next if /^#/;
! 	my ($accession,$pfam) = split /\t/;
! 	$xref->{$accession}{'db:pfam'}{$pfam} = 1;
!   }
!   close ANNFILE;
! }
! 
! =head2 parseKnownAffy
! 
!  Title   : parseKnownAffy
!  Usage   :
!  Function:
!  Example :
!  Returns : 
!  Args    :
! 
! 
! =cut
! 
! sub parseKnownAffy {
!   my $xref = shift @_;
!   foreach my $filename (@_){
!     open  ANNFILE, $filename or die "Can't open file $filename: $!";
!     while(<ANNFILE>) {
!       chomp;
!       next if /^#/;
!       my ($accession,$probeset) = split /\t/;
!       $xref->{$accession}{'db:affy'}{$probeset} = 1;
!     }
!     close ANNFILE;
!   }
  }

***************
*** 297,302 ****

  sub parseKnownGeneMrna {
!   my $filename = shift;
!   my $annotations = {};
    open  ANNFILE, $filename or die "Can't open file $filename: $!";
    while(<ANNFILE>) {
--- 346,350 ----

  sub parseKnownGeneMrna {
!   my($xref,$filename) = @_;
    open  ANNFILE, $filename or die "Can't open file $filename: $!";
    while(<ANNFILE>) {
***************
*** 304,311 ****
      next if /^#/;
  	my ($accession,$sequence) = split /\t/;
! 	$annotations->{$accession} = $sequence;
    }
    close ANNFILE;
-   return ($annotations);
  }

--- 352,358 ----
      next if /^#/;
  	my ($accession,$sequence) = split /\t/;
! 	$xref->{$accession}{'sequence:mrna'} = $sequence;
    }
    close ANNFILE;
  }

***************
*** 324,329 ****

  sub parseKnownGenePep {
!   my $filename = shift;
!   my $annotations = {};
    open  ANNFILE, $filename or die "Can't open file $filename: $!";
    while(<ANNFILE>) {
--- 371,375 ----

  sub parseKnownGenePep {
!   my($xref,$filename) = @_;
    open  ANNFILE, $filename or die "Can't open file $filename: $!";
    while(<ANNFILE>) {
***************
*** 331,339 ****
      next if /^#/;
  	my ($accession,$sequence) = split /\t/;
! 	my $protGenbankId = $mrna2protein->{$accession};
! 	$annotations->{$protGenbankId} = $sequence;
    }
    close ANNFILE;
-   return ($annotations);
  }

--- 377,385 ----
      next if /^#/;
  	my ($accession,$sequence) = split /\t/;
! 
!     $xref->{$accession}{'sequence:protein'} = $sequence;
! #	$xref->{$protGenbankId}{'sequence:protein'} = $sequence;
    }
    close ANNFILE;
  }

***************
*** 341,345 ****

   Title   : mrna2protein
!  Usage   : creates a hash between the mRNA genbank accession (used in UCSC DB to key everything) and the proper genbank protein accession
   Function:
   Example :
--- 387,393 ----

   Title   : mrna2protein
!  Usage   : creates a hash between the mRNA genbank accession
!            (used in UCSC DB to key everything) and the proper
!            genbank protein accession
   Function:
   Example :
***************
*** 351,365 ****

  sub parseGenbank {
!    my $file = shift;
!    my $annotations = {}; # stores the mRNA genbank id as key, protein genbank id as value
!    open ANNFILE, $file or die "Can't open file $file: $!";
!    while(<ANNFILE>) {
! 	 chomp;
! 	 next if /^#/;
! 	 my ($mrna, $prot) = split /\t/;
! 	 $annotations->{$mrna} = $prot;
!    }
!    close ANNFILE;
!    return($annotations);
  }

--- 399,415 ----

  sub parseGenbank {
!   my($xref,$filename) = @_;
! #  my $file = shift;
! #  my $annotations = {}; # stores the mRNA genbank id as key, protein genbank id as value
!   open ANNFILE, $filename or die "Can't open file $filename: $!";
!   while(<ANNFILE>) {
!     chomp;
!     next if /^#/;
!     my ($mrna, $prot) = split /\t/;
!     $xref->{$mrna}{'db:genbank:protein'}{$prot} = 1;
! #    $annotations->{$mrna} = $prot;
!   }
!   close ANNFILE;
! #  return($annotations);
  }

***************
*** 379,385 ****

  sub parseKgXref {
!   my $filename = shift;
!   my $annotations = {};
!   open ANNFILE, $filename or die "Can't open file $filename: $!";
    while(<ANNFILE>) {
      chomp;
--- 429,435 ----

  sub parseKgXref {
!   my($xref,$filename) = @_;
! 
!   open(ANNFILE, $filename) or die "Can't open file $filename: $!";
    while(<ANNFILE>) {
      chomp;
***************
*** 394,411 ****
      $description = uri_escape($description);

!     my $protAccession = $mrna2protein->{$mRNA}; # pulls out the protein genbank accession

!     $annotations->{$key}->{'db:genbank:protein'}->{$protAccession} = 1 if $protAccession;
!     $annotations->{$key}->{'db:genbank:mrna'}->{$kgID}             = 1 if $kgID;
!     $annotations->{$key}->{'db:genbank:mrna'}->{$mRNA}             = 1 if $mRNA;
!     $annotations->{$key}->{'db:swissprot'}->{$spID}                = 1 if $spID;
!     $annotations->{$key}->{'db:swissprot:display'}->{$spDisplayID} = 1 if $spDisplayID;
!     $annotations->{$key}->{'genesymbol'}->{$geneSymbol}            = 1 if $geneSymbol;
!     $annotations->{$key}->{'db:refseq:mrna'}->{$refseq}            = 1 if $refseq;
!     $annotations->{$key}->{'db:refseq:protein'}->{$protAcc}        = 1 if $protAcc;
!     $annotations->{$key}->{'description'}->{$description}          = 1 if $description;
    }
!   close ANNFILE;
!   return($annotations);
  }

--- 444,460 ----
      $description = uri_escape($description);

! #    my $protAccession = $mrna2protein->{$mRNA}; # pulls out the protein genbank accession

! #    $xref->{$key}{'db:genbank:protein'}{$protAccession} = 1 if $protAccession;
!     $xref->{$key}{'db:genbank:mrna'}{$kgID}             = 1 if $kgID;
!     $xref->{$key}{'db:genbank:mrna'}{$mRNA}             = 1 if $mRNA;
!     $xref->{$key}{'db:swissprot'}{$spID}                = 1 if $spID;
!     $xref->{$key}{'db:swissprot:display'}{$spDisplayID} = 1 if $spDisplayID;
!     $xref->{$key}{'genesymbol'}{$geneSymbol}            = 1 if $geneSymbol;
!     $xref->{$key}{'db:refseq:mrna'}{$refseq}            = 1 if $refseq;
!     $xref->{$key}{'db:refseq:protein'}{$protAcc}        = 1 if $protAcc;
!     $xref->{$key}{'description'}{$description}          = 1 if $description;
    }
!   close(ANNFILE);
  }