From: <all...@us...> - 2003-12-19 08:11:27
|
Update of /cvsroot/gmod/schema/chado/bin In directory sc8-pr-cvs1:/tmp/cvs-serv22846/bin Modified Files: ucsc_genes2gff.pl Log Message: overhaul to simplify the script. i'm running into some strangeness though where multiple GB identifiers map to a single protein sequence and mrna GB id. this doesn't happen for mrna GB identifiers and associated sequence... brian, any idea what's going on here? added pfam, affy u133, and affy u95 parsing Index: ucsc_genes2gff.pl =================================================================== RCS file: /cvsroot/gmod/schema/chado/bin/ucsc_genes2gff.pl,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** ucsc_genes2gff.pl 19 Dec 2003 06:49:39 -0000 1.5 --- ucsc_genes2gff.pl 19 Dec 2003 08:11:24 -0000 1.6 *************** *** 8,11 **** --- 8,12 ---- use URI::Escape; use Text::Wrap; + $Text::Wrap::columns = 79; use Bio::SeqIO; use Bio::SeqFeature::Generic; *************** *** 44,60 **** my $KNOWNGENEMRNA = $ANNOTATIONS.'/knownGeneMrna.txt'; my $KNOWNLOCUSLINK = $ANNOTATIONS.'/knownToLocusLink.txt'; my $GENBANK = $ANNOTATIONS.'/genbank2accessions.txt'; my $LOCACC = $ANNOTATIONS.'/loc2acc'; ! my $mrna2protein = parseGenbank($GENBANK); ! my $kgxref = parseKgXref($KGXREF); ! my $loc2acc = parseLocAcc($LOCACC); # the best way I've found so far to link Genbank mRNA accession to Genbank protein accession ! my $knowngenepep = parseKnownGenePep($KNOWNGENEPEP); ! my $knowngenemrna = parseKnownGeneMrna($KNOWNGENEMRNA); ! my $knownlocuslink = parseKnownLocusLink($KNOWNLOCUSLINK); # need to pull in the omim and other annotations too print "##gff-version 3\n"; ! open(KG,$KNOWNGENE) or die "couldn't open('$KNOWNGENE'): $!"; while (<KG>) { --- 45,70 ---- my $KNOWNGENEMRNA = $ANNOTATIONS.'/knownGeneMrna.txt'; my $KNOWNLOCUSLINK = $ANNOTATIONS.'/knownToLocusLink.txt'; + my $KNOWNPFAM = $ANNOTATIONS.'/knownToPfam.txt'; + my $KNOWNU133 = $ANNOTATIONS.'/knownToU133.txt'; + my $KNOWNU95 = $ANNOTATIONS.'/knownToU95.txt'; my $GENBANK = $ANNOTATIONS.'/genbank2accessions.txt'; my $LOCACC = $ANNOTATIONS.'/loc2acc'; ! my %xref; ! ! parseGenbank(\%xref,$GENBANK); ! parseKnownGenePep(\%xref,$KNOWNGENEPEP); ! parseKnownGeneMrna(\%xref,$KNOWNGENEMRNA); ! parseKgXref(\%xref,$KGXREF); ! parseLocAcc(\%xref,$LOCACC); # the best way I've found so far ! # to link Genbank mRNA accession to ! # Genbank protein accession ! parseKnownLocusLink(\%xref,$KNOWNLOCUSLINK); ! parseKnownAffy(\%xref,$KNOWNU133,$KNOWNU95); ! parseKnownPfam(\%xref,$KNOWNPFAM); # need to pull in the omim and other annotations too print "##gff-version 3\n"; ! if(0){ open(KG,$KNOWNGENE) or die "couldn't open('$KNOWNGENE'): $!"; while (<KG>) { *************** *** 75,96 **** # print the transcript print join ("\t",$chrom,$SRCDB,'mRNA',$txStart,$txEnd,'.',$strand,'.',"ID=$id;"); ! if(defined($kgxref->{$id})) { ! foreach my $annotation_set (keys %{$kgxref->{$id}}) { ! print "$annotation_set=". join (",", keys %{$kgxref->{$id}->{$annotation_set}}) .';'; ! ! } ! if(defined $knownlocuslink->{$id}) { ! print "db:locus=", $knownlocuslink->{$id}; } - print "\n"; ! # now write out stuff for protein ! ! my @protGenBank = keys (%{$kgxref->{$id}->{'db:genbank:protein'}}); ! my $protGenBank = $protGenBank[0]; ! ! if(defined ($protGenBank)) { ! print join ("\t",'.', $SRCDB,'protein','.','.','.','.',"ID=$protGenBank;Parent=$id"), "\n"; } } --- 85,100 ---- # print the transcript print join ("\t",$chrom,$SRCDB,'mRNA',$txStart,$txEnd,'.',$strand,'.',"ID=$id;"); ! if(defined($xref{$id})) { ! foreach my $annotation_set (map {($_ !~ /^sequence/) ? $_ : undef} keys %{$xref{$id}}) { ! next unless $annotation_set; ! print "$annotation_set=". join(",", keys %{$xref{$id}{$annotation_set}}) .';'; } print "\n"; ! foreach my $annotation_set (map {($_ !~ /^sequence/ and $_ =~ /genbank:protein/) ? $_ : undef} keys %{$xref{$id}}){ ! next unless $annotation_set; ! print join ("\t",'.', $SRCDB,'protein','.','.','.','.', ! "ID=". join(",", keys %{$xref{$id}{$annotation_set}}) .";Parent=$id" ! ), "\n"; } } *************** *** 209,229 **** } close(KG) or die "couldn't close('$KNOWNGENE'): $!"; ! # for protein/mrna printing ! if(defined($kgxref) && (defined($knowngenepep) || defined($knowngenemrna))) { ! print "##FASTA\n"; ! # now print all my lovely protein sequence ! foreach my $protein (keys %{$knowngenepep}) { ! print ">".$protein."\n"; ! $Text::Wrap::columns = 79; ! print wrap('', '', $knowngenepep->{$protein}); ! print "\n"; } ! # now all the mRNA sequence ! foreach my $mrna (keys %{$knowngenemrna}) { ! print ">".$mrna."\n"; ! $Text::Wrap::columns = 79; ! print wrap('', '', $knowngenemrna->{$mrna}); ! print "\n"; } } --- 213,232 ---- } close(KG) or die "couldn't close('$KNOWNGENE'): $!"; + } ! print "##FASTA\n"; ! ! foreach my $kg (keys %xref){ ! my $seq_mrna = $xref{$kg}{'sequence:mrna'}; ! my $seq_prot = $xref{$kg}{'sequence:protein'}; ! ! if($seq_mrna){ ! print '>'. join(',',keys(%{$xref{$kg}{'db:genbank:mrna'}})) ."\n"; ! print wrap('','',$seq_mrna) ."\n"; } ! ! if($seq_prot){ ! print '>'. join(',',keys(%{$xref{$kg}{'db:genbank:protein'}})) ."\n"; ! print wrap('','',$seq_prot) ."\n"; } } *************** *** 241,246 **** sub parseLocAcc { ! my $filename = shift; ! my $annotations = {}; open ANNFILE, $filename or die "Can't open file $filename: $!"; while(<ANNFILE>) { --- 244,248 ---- sub parseLocAcc { ! my($xref,$filename) = @_; open ANNFILE, $filename or die "Can't open file $filename: $!"; while(<ANNFILE>) { *************** *** 252,259 **** $line[4] =~ /(.*)\.\d/; my $protein = $1; ! if($protein ne '-') { $annotations->{$gene} = $protein; } } close ANNFILE; - return ($annotations); } --- 254,260 ---- $line[4] =~ /(.*)\.\d/; my $protein = $1; ! if($protein ne '-') { $xref->{$gene}{'db:genbank:protein'}{$protein} = 1; } } close ANNFILE; } *************** *** 271,276 **** sub parseKnownLocusLink { ! my $filename = shift; ! my $annotations = {}; open ANNFILE, $filename or die "Can't open file $filename: $!"; while(<ANNFILE>) { --- 272,276 ---- sub parseKnownLocusLink { ! my($xref,$filename) = @_; open ANNFILE, $filename or die "Can't open file $filename: $!"; while(<ANNFILE>) { *************** *** 278,285 **** next if /^#/; my ($accession,$locuslink) = split /\t/; ! $annotations->{$accession} = $locuslink; } close ANNFILE; ! return ($annotations); } --- 278,334 ---- next if /^#/; my ($accession,$locuslink) = split /\t/; ! $xref->{$accession}{'db:locuslink'}{$locuslink} = 1; } close ANNFILE; ! } ! ! =head2 parseKnownPfam ! ! Title : parseKnownPfam ! Usage : ! Function: ! Example : ! Returns : ! Args : ! ! ! =cut ! ! sub parseKnownPfam { ! my($xref,$filename) = @_; ! open ANNFILE, $filename or die "Can't open file $filename: $!"; ! while(<ANNFILE>) { ! chomp; ! next if /^#/; ! my ($accession,$pfam) = split /\t/; ! $xref->{$accession}{'db:pfam'}{$pfam} = 1; ! } ! close ANNFILE; ! } ! ! =head2 parseKnownAffy ! ! Title : parseKnownAffy ! Usage : ! Function: ! Example : ! Returns : ! Args : ! ! ! =cut ! ! sub parseKnownAffy { ! my $xref = shift @_; ! foreach my $filename (@_){ ! open ANNFILE, $filename or die "Can't open file $filename: $!"; ! while(<ANNFILE>) { ! chomp; ! next if /^#/; ! my ($accession,$probeset) = split /\t/; ! $xref->{$accession}{'db:affy'}{$probeset} = 1; ! } ! close ANNFILE; ! } } *************** *** 297,302 **** sub parseKnownGeneMrna { ! my $filename = shift; ! my $annotations = {}; open ANNFILE, $filename or die "Can't open file $filename: $!"; while(<ANNFILE>) { --- 346,350 ---- sub parseKnownGeneMrna { ! my($xref,$filename) = @_; open ANNFILE, $filename or die "Can't open file $filename: $!"; while(<ANNFILE>) { *************** *** 304,311 **** next if /^#/; my ($accession,$sequence) = split /\t/; ! $annotations->{$accession} = $sequence; } close ANNFILE; - return ($annotations); } --- 352,358 ---- next if /^#/; my ($accession,$sequence) = split /\t/; ! $xref->{$accession}{'sequence:mrna'} = $sequence; } close ANNFILE; } *************** *** 324,329 **** sub parseKnownGenePep { ! my $filename = shift; ! my $annotations = {}; open ANNFILE, $filename or die "Can't open file $filename: $!"; while(<ANNFILE>) { --- 371,375 ---- sub parseKnownGenePep { ! my($xref,$filename) = @_; open ANNFILE, $filename or die "Can't open file $filename: $!"; while(<ANNFILE>) { *************** *** 331,339 **** next if /^#/; my ($accession,$sequence) = split /\t/; ! my $protGenbankId = $mrna2protein->{$accession}; ! $annotations->{$protGenbankId} = $sequence; } close ANNFILE; - return ($annotations); } --- 377,385 ---- next if /^#/; my ($accession,$sequence) = split /\t/; ! ! $xref->{$accession}{'sequence:protein'} = $sequence; ! # $xref->{$protGenbankId}{'sequence:protein'} = $sequence; } close ANNFILE; } *************** *** 341,345 **** Title : mrna2protein ! Usage : creates a hash between the mRNA genbank accession (used in UCSC DB to key everything) and the proper genbank protein accession Function: Example : --- 387,393 ---- Title : mrna2protein ! Usage : creates a hash between the mRNA genbank accession ! (used in UCSC DB to key everything) and the proper ! genbank protein accession Function: Example : *************** *** 351,365 **** sub parseGenbank { ! my $file = shift; ! my $annotations = {}; # stores the mRNA genbank id as key, protein genbank id as value ! open ANNFILE, $file or die "Can't open file $file: $!"; ! while(<ANNFILE>) { ! chomp; ! next if /^#/; ! my ($mrna, $prot) = split /\t/; ! $annotations->{$mrna} = $prot; ! } ! close ANNFILE; ! return($annotations); } --- 399,415 ---- sub parseGenbank { ! my($xref,$filename) = @_; ! # my $file = shift; ! # my $annotations = {}; # stores the mRNA genbank id as key, protein genbank id as value ! open ANNFILE, $filename or die "Can't open file $filename: $!"; ! while(<ANNFILE>) { ! chomp; ! next if /^#/; ! my ($mrna, $prot) = split /\t/; ! $xref->{$mrna}{'db:genbank:protein'}{$prot} = 1; ! # $annotations->{$mrna} = $prot; ! } ! close ANNFILE; ! # return($annotations); } *************** *** 379,385 **** sub parseKgXref { ! my $filename = shift; ! my $annotations = {}; ! open ANNFILE, $filename or die "Can't open file $filename: $!"; while(<ANNFILE>) { chomp; --- 429,435 ---- sub parseKgXref { ! my($xref,$filename) = @_; ! ! open(ANNFILE, $filename) or die "Can't open file $filename: $!"; while(<ANNFILE>) { chomp; *************** *** 394,411 **** $description = uri_escape($description); ! my $protAccession = $mrna2protein->{$mRNA}; # pulls out the protein genbank accession ! $annotations->{$key}->{'db:genbank:protein'}->{$protAccession} = 1 if $protAccession; ! $annotations->{$key}->{'db:genbank:mrna'}->{$kgID} = 1 if $kgID; ! $annotations->{$key}->{'db:genbank:mrna'}->{$mRNA} = 1 if $mRNA; ! $annotations->{$key}->{'db:swissprot'}->{$spID} = 1 if $spID; ! $annotations->{$key}->{'db:swissprot:display'}->{$spDisplayID} = 1 if $spDisplayID; ! $annotations->{$key}->{'genesymbol'}->{$geneSymbol} = 1 if $geneSymbol; ! $annotations->{$key}->{'db:refseq:mrna'}->{$refseq} = 1 if $refseq; ! $annotations->{$key}->{'db:refseq:protein'}->{$protAcc} = 1 if $protAcc; ! $annotations->{$key}->{'description'}->{$description} = 1 if $description; } ! close ANNFILE; ! return($annotations); } --- 444,460 ---- $description = uri_escape($description); ! # my $protAccession = $mrna2protein->{$mRNA}; # pulls out the protein genbank accession ! # $xref->{$key}{'db:genbank:protein'}{$protAccession} = 1 if $protAccession; ! $xref->{$key}{'db:genbank:mrna'}{$kgID} = 1 if $kgID; ! $xref->{$key}{'db:genbank:mrna'}{$mRNA} = 1 if $mRNA; ! $xref->{$key}{'db:swissprot'}{$spID} = 1 if $spID; ! $xref->{$key}{'db:swissprot:display'}{$spDisplayID} = 1 if $spDisplayID; ! $xref->{$key}{'genesymbol'}{$geneSymbol} = 1 if $geneSymbol; ! $xref->{$key}{'db:refseq:mrna'}{$refseq} = 1 if $refseq; ! $xref->{$key}{'db:refseq:protein'}{$protAcc} = 1 if $protAcc; ! $xref->{$key}{'description'}{$description} = 1 if $description; } ! close(ANNFILE); } |