Screenshot instructions:
Windows
Mac
Red Hat Linux
Ubuntu
Click URL instructions:
Right-click on ad, choose "Copy Link", then paste here →
(This may not be possible with some types of ads)
You can subscribe to this list here.
2003 |
Jan
|
Feb
|
Mar
|
Apr
|
May
(19) |
Jun
(119) |
Jul
(21) |
Aug
(7) |
Sep
(2) |
Oct
(126) |
Nov
(85) |
Dec
(138) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2004 |
Jan
(33) |
Feb
(11) |
Mar
(53) |
Apr
(73) |
May
(31) |
Jun
(33) |
Jul
(81) |
Aug
(68) |
Sep
(44) |
Oct
(21) |
Nov
(30) |
Dec
(50) |
2005 |
Jan
(9) |
Feb
(61) |
Mar
(83) |
Apr
(81) |
May
(41) |
Jun
(40) |
Jul
(87) |
Aug
(32) |
Sep
(22) |
Oct
(6) |
Nov
(9) |
Dec
(63) |
2006 |
Jan
(17) |
Feb
(5) |
Mar
(19) |
Apr
(71) |
May
(10) |
Jun
(18) |
Jul
(22) |
Aug
(18) |
Sep
(12) |
Oct
(10) |
Nov
(14) |
Dec
(32) |
2007 |
Jan
(52) |
Feb
(82) |
Mar
(53) |
Apr
(59) |
May
(13) |
Jun
(5) |
Jul
(8) |
Aug
(108) |
Sep
(10) |
Oct
(17) |
Nov
(20) |
Dec
(12) |
2008 |
Jan
(16) |
Feb
(7) |
Mar
(62) |
Apr
(6) |
May
(21) |
Jun
(10) |
Jul
(32) |
Aug
(12) |
Sep
(16) |
Oct
(27) |
Nov
(31) |
Dec
(37) |
2009 |
Jan
(10) |
Feb
(14) |
Mar
(13) |
Apr
(22) |
May
(52) |
Jun
(40) |
Jul
(26) |
Aug
(20) |
Sep
(31) |
Oct
(11) |
Nov
(7) |
Dec
(37) |
2010 |
Jan
(12) |
Feb
(6) |
Mar
(8) |
Apr
(25) |
May
(20) |
Jun
(23) |
Jul
(4) |
Aug
(20) |
Sep
(7) |
Oct
(11) |
Nov
(21) |
Dec
|
2011 |
Jan
(1) |
Feb
(4) |
Mar
(5) |
Apr
(4) |
May
(1) |
Jun
(10) |
Jul
(4) |
Aug
(20) |
Sep
(20) |
Oct
(4) |
Nov
(16) |
Dec
(1) |
2012 |
Jan
(1) |
Feb
(2) |
Mar
(7) |
Apr
(2) |
May
|
Jun
|
Jul
(1) |
Aug
(1) |
Sep
|
Oct
|
Nov
|
Dec
(2) |
2013 |
Jan
(1) |
Feb
(3) |
Mar
(3) |
Apr
|
May
(2) |
Jun
|
Jul
(2) |
Aug
|
Sep
(1) |
Oct
|
Nov
|
Dec
|
2014 |
Jan
|
Feb
|
Mar
|
Apr
(1) |
May
|
Jun
|
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
2015 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
(1) |
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
2016 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
|
Aug
|
Sep
|
Oct
(3) |
Nov
|
Dec
|
2017 |
Jan
|
Feb
|
Mar
|
Apr
|
May
(1) |
Jun
|
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
S | M | T | W | T | F | S |
---|---|---|---|---|---|---|
|
1
|
2
(16) |
3
(16) |
4
(33) |
5
(2) |
6
(1) |
7
|
8
(1) |
9
(6) |
10
(16) |
11
(3) |
12
(3) |
13
(1) |
14
|
15
|
16
|
17
(1) |
18
(11) |
19
(11) |
20
|
21
|
22
(3) |
23
(3) |
24
(6) |
25
(1) |
26
(3) |
27
(1) |
28
|
29
|
30
|
31
|
|
|
|
From: <allenday@us...> - 2003-12-27 00:21:12
|
Update of /cvsroot/gmod/schema/chado/bin In directory sc8-pr-cvs1:/tmp/cvs-serv18055/bin Modified Files: ucsc_genes2gff.pl Log Message: adding support for GO annotations, unigene IDs, OMIM IDs, RefSeq descriptions. Index: ucsc_genes2gff.pl =================================================================== RCS file: /cvsroot/gmod/schema/chado/bin/ucsc_genes2gff.pl,v retrieving revision 1.8 retrieving revision 1.9 diff -C2 -d -r1.8 -r1.9 *** ucsc_genes2gff.pl 22 Dec 2003 17:23:07 -0000 1.8 --- ucsc_genes2gff.pl 27 Dec 2003 00:21:09 -0000 1.9 *************** *** 15,23 **** my $executable = basename($0); ! my ($SRCDB,$ORIGIN,$ANNOTATIONS,$CENTER); GetOptions('srcdb:s' => \$SRCDB, 'origin:i' => \$ORIGIN, 'annotations:s' => \$ANNOTATIONS, - 'center:s' => \$CENTER, ) and $ANNOTATIONS or die <<USAGE; Usage: $0 -annotations <dir> [options] --- 15,22 ---- my $executable = basename($0); ! my ($SRCDB,$ORIGIN,$ANNOTATIONS); GetOptions('srcdb:s' => \$SRCDB, 'origin:i' => \$ORIGIN, 'annotations:s' => \$ANNOTATIONS, ) and $ANNOTATIONS or die <<USAGE; Usage: $0 -annotations <dir> [options] *************** *** 38,42 **** $SRCDB ||= 'UCSC'; - my $CENTER ||= 'unigene'; $ORIGIN ||= 1; my $KGXREF = $ANNOTATIONS.'/kgXref.txt'; --- 37,40 ---- *************** *** 50,55 **** --- 48,60 ---- my $GENBANK = $ANNOTATIONS.'/genbank2accessions.txt'; my $LOCACC = $ANNOTATIONS.'/loc2acc'; + my $LOCGO = $ANNOTATIONS.'/loc2go'; + my $LOCUG = $ANNOTATIONS.'/loc2UG'; + my $REFLINK = $ANNOTATIONS.'/refLink.txt'; + my $REFSEQSUMMARY = $ANNOTATIONS.'/refSeqSummary.txt'; + my $CHROMINFO = $ANNOTATIONS.'/chromInfo.txt'; my %xref; + my %loc2mrna; + my %ref2mrna; parseGenbank(\%xref,$GENBANK); *************** *** 60,67 **** # to link Genbank mRNA accession to # Genbank protein accession parseKnownLocusLink(\%xref,$KNOWNLOCUSLINK); parseKnownAffy(\%xref,$KNOWNU133,$KNOWNU95); parseKnownPfam(\%xref,$KNOWNPFAM); ! # need to pull in the omim and other annotations too print "##gff-version 3\n"; --- 65,75 ---- # to link Genbank mRNA accession to # Genbank protein accession + parseLocGo(\%xref,$LOCGO); + parseLocUG(\%xref,$LOCUG); parseKnownLocusLink(\%xref,$KNOWNLOCUSLINK); parseKnownAffy(\%xref,$KNOWNU133,$KNOWNU95); parseKnownPfam(\%xref,$KNOWNPFAM); ! parseRefLink(\%xref,$REFLINK); ! parseRefSeqSummary(\%xref,$REFSEQSUMMARY); print "##gff-version 3\n"; *************** *** 264,267 **** --- 272,277 ---- #my $protein = $1; + my $loc = $line[0]; + my $gene = $line[1]; my $protein = $line[4]; *************** *** 271,274 **** --- 281,286 ---- next if $gene eq 'none'; + push @{ $loc2mrna{$loc} }, $gene; + $xref->{$gene}{'db:genbank:protein'}{$protein} = 1 unless($gene eq 'none' || $protein eq '-'); } *************** *** 276,279 **** --- 288,410 ---- } + =head2 parseLocGo + + Title : parseLocGo + Usage : + Function: + Example : + Returns : + Args : + + =cut + + sub parseLocGo { + my($xref,$filename) = @_; + open ANNFILE, $filename or die "Can't open file $filename: $!"; + while(<ANNFILE>) { + chomp; + next if /^#/; + my @line = split /\t/; + + if($loc2mrna{$line[0]}){ + foreach my $mrna (@{$loc2mrna{$line[0]}}){ + $xref->{$mrna}{'cvterm:go'}{$line[1]} = 1; + } + } + } + close ANNFILE; + } + + =head2 parseLocUG + + Title : parseLocUG + Usage : + Function: + Example : + Returns : + Args : + + =cut + + sub parseLocUG { + my($xref,$filename) = @_; + open ANNFILE, $filename or die "Can't open file $filename: $!"; + while(<ANNFILE>) { + chomp; + next if /^#/; + my @line = split /\t/; + + if($loc2mrna{$line[0]}){ + foreach my $mrna (@{$loc2mrna{$line[0]}}){ + $xref->{$mrna}{'db:unigene'}{$line[1]} = 1; + } + } + } + close ANNFILE; + } + + =head2 parseRefLink + + Title : parseRefLink + Usage : + Function: + Example : + Returns : + Args : + + =cut + + sub parseRefLink { + my($xref,$filename) = @_; + open ANNFILE, $filename or die "Can't open file $filename: $!"; + while(<ANNFILE>) { + chomp; + next if /^#/; + my($symbol,$description,$refmrna,$refprotein,undef,undef,$locus,$omim) = split /\t/; + $description = uri_escape($description); + + if($ref2mrna{$refmrna}){ + foreach my $mrna (@{$ref2mrna{$refmrna}}){ + $xref->{$mrna}{'db:locuslink'}{$locus} = 1; + $xref->{$mrna}{'db:omim'}{$omim} = 1; + $xref->{$mrna}{'db:refseq:mrna'}{$refmrna} = 1; + $xref->{$mrna}{'db:refseq:protein'}{$refprotein} = 1; + $xref->{$mrna}{'Alias'}{$symbol} = 1; + $xref->{$mrna}{'description'}{$description} = 1; + } + } + } + close ANNFILE; + } + + =head2 parseRefSeqSummary + + Title : parseRefSeqSummary + Usage : + Function: + Example : + Returns : + Args : + + =cut + + sub parseRefSeqSummary { + my($xref,$filename) = @_; + open ANNFILE, $filename or die "Can't open file $filename: $!"; + while(<ANNFILE>) { + chomp; + next if /^#/; + my($refmrna,$completeness,$description) = split /\t/; + $description = uri_escape($description); + + if($ref2mrna{$refmrna}){ + foreach my $mrna (@{$ref2mrna{$refmrna}}){ + $xref->{$mrna}{'completeness'}{$completeness} = 1; + $xref->{$mrna}{'description'}{$description} = 1; + } + } + } + close ANNFILE; + } =head2 parseKnownLocusLink *************** *** 457,466 **** # first two are the same (genebank) followed by swissprot etc... my ($kgID, $mRNA, $spID, $spDisplayID, $geneSymbol, $refseq, $protAcc, $description) = split /\t/; ! my $key = ""; ! if($CENTER =~ /unigene/i) { $key = $kgID; } ! else { $key = $refseq; } # escape certain fields $key = uri_escape($key); $description = uri_escape($description); $xref->{$key}{'db:genbank:mrna'}{$kgID} = 1 if $kgID; --- 588,597 ---- # first two are the same (genebank) followed by swissprot etc... my ($kgID, $mRNA, $spID, $spDisplayID, $geneSymbol, $refseq, $protAcc, $description) = split /\t/; ! my $key = $kgID; # escape certain fields $key = uri_escape($key); $description = uri_escape($description); + + push @{ $ref2mrna{$refseq} }, $kgID; $xref->{$key}{'db:genbank:mrna'}{$kgID} = 1 if $kgID; |
From: <allenday@us...> - 2003-12-26 19:47:17
|
Update of /cvsroot/gmod/schema/chado/load/bin In directory sc8-pr-cvs1:/tmp/cvs-serv6074/load/bin Modified Files: load_affyxls.pl Log Message: doc fix Index: load_affyxls.pl =================================================================== RCS file: /cvsroot/gmod/schema/chado/load/bin/load_affyxls.pl,v retrieving revision 1.12 retrieving revision 1.13 diff -C2 -d -r1.12 -r1.13 *** load_affyxls.pl 24 Dec 2003 00:50:40 -0000 1.12 --- load_affyxls.pl 26 Dec 2003 19:47:12 -0000 1.13 *************** *** 66,70 **** while(my $arrayio = $affx->next_array){ my @txn = (); ! #last unless $arrayio->id; print STDERR "loading array ".$arrayio->id."\n"; --- 66,70 ---- while(my $arrayio = $affx->next_array){ my @txn = (); ! last unless $arrayio->id; print STDERR "loading array ".$arrayio->id."\n"; *************** *** 167,171 **** if(!$chado_cvterm){ my($chado_dbxref) = Chado::Dbxref->search(accession => $cvterm); ! my $fatal = undef;; ($chado_cvterm) = Chado::Cvterm->search(dbxref_id => $chado_dbxref) or $fatal = "couldn't find cvterm for $cvterm, you need to create it"; --- 167,171 ---- if(!$chado_cvterm){ my($chado_dbxref) = Chado::Dbxref->search(accession => $cvterm); ! my $fatal = undef; ($chado_cvterm) = Chado::Cvterm->search(dbxref_id => $chado_dbxref) or $fatal = "couldn't find cvterm for $cvterm, you need to create it"; *************** *** 327,330 **** --- 327,331 ---- next unless $1; $cvterm = $2 ? "$1:$2" : $1; + $cvterm =~ s/:+/:/g while $cvterm =~ /::/; $cvterm{$cvterm} = $val; } |
From: <allenday@us...> - 2003-12-26 19:47:16
|
Update of /cvsroot/gmod/schema/chado In directory sc8-pr-cvs1:/tmp/cvs-serv6074 Modified Files: Makefile.PL Log Message: doc fix Index: Makefile.PL =================================================================== RCS file: /cvsroot/gmod/schema/chado/Makefile.PL,v retrieving revision 1.48 retrieving revision 1.49 diff -C2 -d -r1.48 -r1.49 *** Makefile.PL 18 Dec 2003 17:07:31 -0000 1.48 --- Makefile.PL 26 Dec 2003 19:47:12 -0000 1.49 *************** *** 72,76 **** e.g., perl Makefile.PL DBDRIVER=PostgreSQL DBNAME=chado [...] ! Optionally, if the file "load.conf" can be found from a previous run of the Makefile.PL, you will be prompted to confirm the values saved in this file. --- 72,76 ---- e.g., perl Makefile.PL DBDRIVER=PostgreSQL DBNAME=chado [...] ! Optionally, if the file "build.conf" can be found from a previous run of the Makefile.PL, you will be prompted to confirm the values saved in this file. |
From: <allenday@us...> - 2003-12-26 06:47:05
|
Update of /cvsroot/gmod/schema/chado/modules/expression In directory sc8-pr-cvs1:/tmp/cvs-serv1872/modules/expression Modified Files: rad.sql Log Message: adding unique constraint on biomaterialprop annotations and optional values. not sure why this wasn't here before... Index: rad.sql =================================================================== RCS file: /cvsroot/gmod/schema/chado/modules/expression/rad.sql,v retrieving revision 1.17 retrieving revision 1.18 diff -C2 -d -r1.17 -r1.18 *** rad.sql 10 Dec 2003 19:58:21 -0000 1.17 --- rad.sql 26 Dec 2003 06:47:02 -0000 1.18 *************** *** 205,209 **** type_id int not null, foreign key (type_id) references cvterm (cvterm_id) on delete cascade, ! value varchar(100) null ); create index biomaterialprop_idx1 on biomaterialprop (biomaterial_id); --- 205,210 ---- type_id int not null, foreign key (type_id) references cvterm (cvterm_id) on delete cascade, ! value varchar(100) null, ! unique(biomaterial_id,type_id,value) ); create index biomaterialprop_idx1 on biomaterialprop (biomaterial_id); |
From: <allenday@us...> - 2003-12-25 01:14:12
|
Update of /cvsroot/gmod/schema/chado/load/bin In directory sc8-pr-cvs1:/tmp/cvs-serv23110/load/bin Modified Files: load_gff3.PLS Log Message: more suitable for systems with smaller memory. my 512MB desktop was crashing with 100MB shreds. undef dna asap to free memory before doing commit. Index: load_gff3.PLS =================================================================== RCS file: /cvsroot/gmod/schema/chado/load/bin/load_gff3.PLS,v retrieving revision 1.10 retrieving revision 1.11 diff -C2 -d -r1.10 -r1.11 *** load_gff3.PLS 24 Dec 2003 21:08:34 -0000 1.10 --- load_gff3.PLS 25 Dec 2003 01:14:09 -0000 1.11 *************** *** 615,619 **** my $dna = $seq->seq; ! my $shredsize = 100_000_000; #don't increase this... my $offset = 0; my $dnalen = length($dna); --- 615,619 ---- my $dna = $seq->seq; ! my $shredsize = 20_000_000; #don't increase this above 100MB... my $offset = 0; my $dnalen = length($dna); *************** *** 625,628 **** --- 625,630 ---- $offset += $shredsize; } + + undef $dna; $f->update; |
From: <allenday@us...> - 2003-12-24 21:08:37
|
Update of /cvsroot/gmod/schema/chado/load/bin In directory sc8-pr-cvs1:/tmp/cvs-serv14506/load/bin Modified Files: load_gff3.PLS Log Message: invert count Index: load_gff3.PLS =================================================================== RCS file: /cvsroot/gmod/schema/chado/load/bin/load_gff3.PLS,v retrieving revision 1.9 retrieving revision 1.10 diff -C2 -d -r1.9 -r1.10 *** load_gff3.PLS 24 Dec 2003 04:48:51 -0000 1.9 --- load_gff3.PLS 24 Dec 2003 21:08:34 -0000 1.10 *************** *** 234,257 **** } ! unless (defined $cvterm{$tag}) { ! $cvterm{$tag} = Chado::Cvterm->find_or_create ({ ! name => $tag, ! cv_id => $cv->id, ! definition => 'auto created by load_gff3.pl' ! }); ! ! push @transaction, $cvterm{$tag}; ! } ! ! my @values = $gff_feature->get_tag_values($tag); ! foreach my $value (@values) { ! my($chado_featureprop) = Chado::Featureprop->find_or_create({ ! feature_id => $chado_feature->id, ! type_id => $cvterm{$tag}->id, ! value => $value ! }); ! ! push @transaction, $chado_featureprop; ! } } --- 234,257 ---- } ! #unless (defined $cvterm{$tag}) { ! # $cvterm{$tag} = Chado::Cvterm->find_or_create ({ ! # name => $tag, ! # cv_id => $cv->id, ! # definition => 'auto created by load_gff3.pl' ! # }); ! # ! # push @transaction, $cvterm{$tag}; ! #} ! # ! #my @values = $gff_feature->get_tag_values($tag); ! #foreach my $value (@values) { ! # my($chado_featureprop) = Chado::Featureprop->find_or_create({ ! # feature_id => $chado_feature->id, ! # type_id => $cvterm{$tag}->id, ! # value => $value ! # }); ! # ! # push @transaction, $chado_featureprop; ! #} } *************** *** 587,591 **** #count the file lines. we need this to track load progress ! open(WC,"grep -c -v '^>' $TMPFASTA |"); $linecount = <WC>; chomp $linecount; close(WC); --- 587,591 ---- #count the file lines. we need this to track load progress ! open(WC,"grep -c '^>' $TMPFASTA |"); $linecount = <WC>; chomp $linecount; close(WC); |
From: <allenday@us...> - 2003-12-24 07:29:11
|
Update of /cvsroot/gmod/schema/chado In directory sc8-pr-cvs1:/tmp/cvs-serv17481 Modified Files: README.H_sapiens Log Message: script name changes Index: README.H_sapiens =================================================================== RCS file: /cvsroot/gmod/schema/chado/README.H_sapiens,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** README.H_sapiens 24 Dec 2003 07:17:35 -0000 1.5 --- README.H_sapiens 24 Dec 2003 07:29:08 -0000 1.6 *************** *** 36,40 **** directory. ! ./load/bin/load_gff3.pl --organism Human --srcdb 'DB:ucsc' --gfffile Hg16.chromosome.gff If you look at the data loaded into the feature table at this point, --- 36,40 ---- directory. ! ./load/bin/gmod_load_gff3.pl --organism Human --srcdb 'DB:ucsc' --gfffile Hg16.chromosome.gff If you look at the data loaded into the feature table at this point, *************** *** 53,57 **** > foreach i (*.fa) ? echo $i ! ? ./load/bin/load_gff3.pl --srcdb 'DB:ucsc' --organism Human --gfffile $i ? end > --- 53,57 ---- > foreach i (*.fa) ? echo $i ! ? ./load/bin/gmod_load_gff3.pl --srcdb 'DB:ucsc' --organism Human --gfffile $i ? end > *************** *** 67,71 **** Now load the GFF file as: ! ./load/bin/load_gff3.pl --organism Human --srcdb 'DB:genbank' --cache 10000 --gfffile Hg16.knownGene.gff This will take several hours to complete. --- 67,71 ---- Now load the GFF file as: ! ./load/bin/gmod_load_gff3.pl --organism Human --srcdb 'DB:genbank' --cache 10000 --gfffile Hg16.knownGene.gff This will take several hours to complete. *************** *** 78,83 **** and load the GFF files as: ! ./load/bin/load_gff3.pl --organism Human --srcdb 'DB:affy:U133' --cache 10000 --gfffile Hg16.U133.gff ! ./load/bin/load_gff3.pl --organism Human --srcdb 'DB:affy:U95' --cache 10000 --gfffile Hg16.U95.gff This may take a few hours to complete. --- 78,83 ---- and load the GFF files as: ! ./load/bin/gmod_load_gff3.pl --organism Human --srcdb 'DB:affy:U133' --cache 10000 --gfffile Hg16.U133.gff ! ./load/bin/gmod_load_gff3.pl --organism Human --srcdb 'DB:affy:U95' --cache 10000 --gfffile Hg16.U95.gff This may take a few hours to complete. |
From: <allenday@us...> - 2003-12-24 07:17:38
|
Update of /cvsroot/gmod/schema/chado In directory sc8-pr-cvs1:/tmp/cvs-serv16252 Modified Files: README.H_sapiens Log Message: typo Index: README.H_sapiens =================================================================== RCS file: /cvsroot/gmod/schema/chado/README.H_sapiens,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** README.H_sapiens 23 Dec 2003 22:19:06 -0000 1.4 --- README.H_sapiens 24 Dec 2003 07:17:35 -0000 1.5 *************** *** 31,40 **** the chromosomes given in UCSC Build 16 (hg16) of the Human genome. ! wget http://sumo.genetics.ucla.edu/~allenday/chado/Hg16.chromosomes.gff Load the GFF file as below. Make sure you are in your chado distribution directory. ! ./load/bin/load_gff3.pl --organism Human --srcdb 'DB:ucsc' --gfffile Hg16.chromosomes.gff If you look at the data loaded into the feature table at this point, --- 31,40 ---- the chromosomes given in UCSC Build 16 (hg16) of the Human genome. ! wget http://sumo.genetics.ucla.edu/~allenday/chado/Hg16.chromosome.gff Load the GFF file as below. Make sure you are in your chado distribution directory. ! ./load/bin/load_gff3.pl --organism Human --srcdb 'DB:ucsc' --gfffile Hg16.chromosome.gff If you look at the data loaded into the feature table at this point, |
From: <allenday@us...> - 2003-12-24 04:48:55
|
Update of /cvsroot/gmod/schema/chado/load/bin In directory sc8-pr-cvs1:/tmp/cvs-serv30372/load/bin Modified Files: load_gff3.PLS Log Message: don't be overly zealous in the creation of entries in synonym and feature_synonym Index: load_gff3.PLS =================================================================== RCS file: /cvsroot/gmod/schema/chado/load/bin/load_gff3.PLS,v retrieving revision 1.8 retrieving revision 1.9 diff -C2 -d -r1.8 -r1.9 *** load_gff3.PLS 22 Dec 2003 19:53:18 -0000 1.8 --- load_gff3.PLS 24 Dec 2003 04:48:51 -0000 1.9 *************** *** 340,359 **** my $id = shift; ! if($gff_feature->has_tag('ID')){ ! my($chado_synonym1) = Chado::Synonym->find_or_create({ ! name => $id, ! synonym_sgml => $id, ! type_id => $cvterm{synonym}->id ! }); ! ! my($chado_synonym2) = Chado::Feature_Synonym->find_or_create ({ ! synonym_id => $chado_synonym1->id, ! feature_id => $chado_feature->id, ! pub_id => $pub->id ! }); ! ! push @transaction, $chado_synonym1; ! push @transaction, $chado_synonym2; ! } } --- 340,367 ---- my $id = shift; ! #i don't think this cost of clogging up ! #the synonym table is worth making searches ! #easier. ! # ! #if you want a synonym, use an Alias tag. ! #searches need to look in two places ! #(feature and synonym) for primary and ! #secondary IDs ! #if($gff_feature->has_tag('ID')){ ! # my($chado_synonym1) = Chado::Synonym->find_or_create({ ! # name => $id, ! # synonym_sgml => $id, ! # type_id => $cvterm{synonym}->id ! # }); ! # ! # my($chado_synonym2) = Chado::Feature_Synonym->find_or_create ({ ! # synonym_id => $chado_synonym1->id, ! # feature_id => $chado_feature->id, ! # pub_id => $pub->id ! # }); ! # ! # push @transaction, $chado_synonym1; ! # push @transaction, $chado_synonym2; ! #} } |
From: <allenday@us...> - 2003-12-24 00:50:49
|
Update of /cvsroot/gmod/schema/chado/load/bin In directory sc8-pr-cvs1:/tmp/cvs-serv27805/load/bin Modified Files: load_affyxls.pl Log Message: more diagnostic output Index: load_affyxls.pl =================================================================== RCS file: /cvsroot/gmod/schema/chado/load/bin/load_affyxls.pl,v retrieving revision 1.11 retrieving revision 1.12 diff -C2 -d -r1.11 -r1.12 *** load_affyxls.pl 24 Dec 2003 00:15:49 -0000 1.11 --- load_affyxls.pl 24 Dec 2003 00:50:40 -0000 1.12 *************** *** 79,92 **** --- 79,102 ---- #if($arrayio->id =~ /^(\d+)\-(\d+)\-(\S+)/){ if($arrayfile =~ m!/!){ + #has leading directory and cvterms if($arrayfile =~ /^.*\/(\d+)\-(\d+)\-(\S+)/){ $chip_id = $1; $sample_id = $2; $cvterms = $3; + #has leading directory + } elsif($arrayfile =~ /^.*\/(\d+)\-(\d+)/){ + $chip_id = $1; + $sample_id = $2; } } else { + #has cvterms if($arrayfile =~ /^(\d+)\-(\d+)\-(\S+)/){ $chip_id = $1; $sample_id = $2; $cvterms = $3; + #has nothing + } elsif($arrayfile =~ /^(\d+)\-(\d+)/){ + $chip_id = $1; + $sample_id = $2; } } *************** *** 116,132 **** --- 126,149 ---- my($array) = Chado::Array->search(name => $arraytype); ($array) ||= Chado::Array->search(name => 'unknown'); + $LOG->debug("loaded record for array type: ".$array->name); my($nulltype) = Chado::Cvterm->search( name => 'null' ); my($oligo) = Chado::Cvterm->search( name => 'microarray_oligo' ); die "couldn't find ontology term 'microarray_oligo', did you load the Sequence Ontology?" unless ref($oligo); + $LOG->debug("loaded records for generic cvterms"); my($human) = Chado::Organism->search( common_name => 'Human' ); + $LOG->debug("loaded record for organism"); my $operator = Chado::Contact->find_or_create( { name => 'UCLA Microarray Core' }); + $LOG->debug("loaded record for hybridization operator"); my $operator_quantification = Chado::Contact->find_or_create( { name => $ENV{USER} }); + $LOG->debug("loaded record for database operator"); my $analysis = Chado::Analysis->find_or_create({ name => 'keystone normalization', program => 'dChip unix', programversion => '1.0'}); + $LOG->debug("loaded record for normalization algorithm"); my $protocol_assay = Chado::Protocol->find_or_create({ name => 'default assay protocol', type_id => $nulltype }); my $protocol_acquisition = Chado::Protocol->find_or_create({ name => 'default acquisition protocol', type_id => $nulltype }); my $protocol_quantification = Chado::Protocol->find_or_create({ name => 'default quantification protocol', type_id => $nulltype }); + $LOG->debug("loaded records for protocols"); push @txn, $operator; *************** *** 143,146 **** --- 160,164 ---- $newchip++ ; } + $LOG->debug("biomaterial_id: ".$biomaterial->id); push @txn, $biomaterial; *************** *** 177,180 **** --- 195,199 ---- $newchip++; } + $LOG->debug("assay_id: ".$assay->id); push @txn, $assay; *************** *** 313,316 **** --- 332,337 ---- } + #this is a mapping table for legacy annotation IDs based on GUSDB, + #and is only for internal use at UCLA. sub _remap_cvterm { my $cvterm_id = shift; |
From: <allenday@us...> - 2003-12-24 00:15:52
|
Update of /cvsroot/gmod/schema/chado/load/bin In directory sc8-pr-cvs1:/tmp/cvs-serv21834/load/bin Modified Files: load_affyxls.pl Log Message: asdf Index: load_affyxls.pl =================================================================== RCS file: /cvsroot/gmod/schema/chado/load/bin/load_affyxls.pl,v retrieving revision 1.10 retrieving revision 1.11 diff -C2 -d -r1.10 -r1.11 *** load_affyxls.pl 23 Dec 2003 22:19:06 -0000 1.10 --- load_affyxls.pl 24 Dec 2003 00:15:49 -0000 1.11 *************** *** 235,245 **** if(!$feature){ ! $progress->message("creating feature: ".$featuregroup->id); ! $feature = Chado::Feature->find_or_create({ ! organism_id => $human, ! type_id => $oligo, ! name => $featuregroup->id, ! uniquename => 'Affy:Transcript:HG-'. $arraytype .':'. $featuregroup->id, ! }); $feature{$featuregroup->id}{feature_id} = $feature->id; push @txn, $feature; --- 235,251 ---- if(!$feature){ ! #the feature may exist, but not be linked to an element (ergo array) yet. ! ($feature) = Chado::Feature->search(name => $featuregroup->id); ! ! if(!ref($feature)){ ! $feature = Chado::Feature->find_or_create({ ! organism_id => $human, ! type_id => $oligo, ! name => $featuregroup->id, ! uniquename => 'Affy:Transcript:HG-'. $arraytype .':'. $featuregroup->id, ! }); ! ! $progress->message("creating feature: ".$featuregroup->id); ! } $feature{$featuregroup->id}{feature_id} = $feature->id; push @txn, $feature; *************** *** 247,251 **** if(!$element){ ! $progress->message("creating element for: ".$featuregroup->id); $element = Chado::Element->find_or_create({ feature_id => $feature, --- 253,258 ---- if(!$element){ ! $progress->message("creating element for: ".$featuregroup->id); ! $element = Chado::Element->find_or_create({ feature_id => $feature, |
From: <allenday@us...> - 2003-12-23 22:19:10
|
Update of /cvsroot/gmod/schema/chado/load/etc In directory sc8-pr-cvs1:/tmp/cvs-serv1436/load/etc Modified Files: initialize.sql Log Message: adding details to load affy Hg16 alignments Index: initialize.sql =================================================================== RCS file: /cvsroot/gmod/schema/chado/load/etc/initialize.sql,v retrieving revision 1.10 retrieving revision 1.11 diff -C2 -d -r1.10 -r1.11 *** initialize.sql 18 Dec 2003 04:24:18 -0000 1.10 --- initialize.sql 23 Dec 2003 22:19:06 -0000 1.11 *************** *** 16,20 **** insert into cvterm (name,definition,cv_id) values ('photochemical_oligo','in-situ photochemically synthesized oligoes',(select cv_id from cv where name = 'Ad Hoc Ontology')); ! insert into pub (miniref,type_id) values ('null',(select cvterm_id from cvterm where name = 'null')); insert into db (name, contact_id) values ('DB:refseq' ,(select contact_id from contact where name = 'null')); insert into db (name, contact_id) values ('DB:genbank',(select contact_id from contact where name = 'null')); --- 16,20 ---- insert into cvterm (name,definition,cv_id) values ('photochemical_oligo','in-situ photochemically synthesized oligoes',(select cv_id from cv where name = 'Ad Hoc Ontology')); ! insert into pub (miniref,uniquename,type_id) values ('null','null',(select cvterm_id from cvterm where name = 'null')); insert into db (name, contact_id) values ('DB:refseq' ,(select contact_id from contact where name = 'null')); insert into db (name, contact_id) values ('DB:genbank',(select contact_id from contact where name = 'null')); |
From: <allenday@us...> - 2003-12-23 22:19:10
|
Update of /cvsroot/gmod/schema/chado/load/bin In directory sc8-pr-cvs1:/tmp/cvs-serv1436/load/bin Modified Files: load_affyxls.pl Log Message: adding details to load affy Hg16 alignments Index: load_affyxls.pl =================================================================== RCS file: /cvsroot/gmod/schema/chado/load/bin/load_affyxls.pl,v retrieving revision 1.9 retrieving revision 1.10 diff -C2 -d -r1.9 -r1.10 *** load_affyxls.pl 2 Dec 2003 07:46:32 -0000 1.9 --- load_affyxls.pl 23 Dec 2003 22:19:06 -0000 1.10 *************** *** 119,122 **** --- 119,124 ---- my($nulltype) = Chado::Cvterm->search( name => 'null' ); my($oligo) = Chado::Cvterm->search( name => 'microarray_oligo' ); + die "couldn't find ontology term 'microarray_oligo', did you load the Sequence Ontology?" unless ref($oligo); + my($human) = Chado::Organism->search( common_name => 'Human' ); my $operator = Chado::Contact->find_or_create( { name => 'UCLA Microarray Core' }); *************** *** 165,169 **** my $assay = Chado::Assay->find_or_create({ ! array_id => $array, operator_id => $operator->id, name => $chip_id, --- 167,171 ---- my $assay = Chado::Assay->find_or_create({ ! array_id => $array->id, operator_id => $operator->id, name => $chip_id, *************** *** 233,236 **** --- 235,239 ---- if(!$feature){ + $progress->message("creating feature: ".$featuregroup->id); $feature = Chado::Feature->find_or_create({ organism_id => $human, *************** *** 244,247 **** --- 247,251 ---- if(!$element){ + $progress->message("creating element for: ".$featuregroup->id); $element = Chado::Element->find_or_create({ feature_id => $feature, |
From: <allenday@us...> - 2003-12-23 22:19:09
|
Update of /cvsroot/gmod/schema/chado In directory sc8-pr-cvs1:/tmp/cvs-serv1436 Modified Files: README.H_sapiens Log Message: adding details to load affy Hg16 alignments Index: README.H_sapiens =================================================================== RCS file: /cvsroot/gmod/schema/chado/README.H_sapiens,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** README.H_sapiens 10 Dec 2003 16:04:38 -0000 1.3 --- README.H_sapiens 23 Dec 2003 22:19:06 -0000 1.4 *************** *** 31,40 **** the chromosomes given in UCSC Build 16 (hg16) of the Human genome. ! wget http://sumo.genetics.ucla.edu/~allenday/chado/chromosomes.gff Load the GFF file as below. Make sure you are in your chado distribution directory. ! ./load/bin/load_gff3.pl --organism Human --srcdb 'DB:ucsc' --gfffile chromosomes.gff If you look at the data loaded into the feature table at this point, --- 31,40 ---- the chromosomes given in UCSC Build 16 (hg16) of the Human genome. ! wget http://sumo.genetics.ucla.edu/~allenday/chado/Hg16.chromosomes.gff Load the GFF file as below. Make sure you are in your chado distribution directory. ! ./load/bin/load_gff3.pl --organism Human --srcdb 'DB:ucsc' --gfffile Hg16.chromosomes.gff If you look at the data loaded into the feature table at this point, *************** *** 60,73 **** 3. Download gene model data. The following URL provides all GenBank mRNA ! records available from UCSC hg16. Each gene model has extra data attached to it, such as LocusLink ID, SwissProt ID, gene symbols, and so on. ! wget http://sumo.genetics.ucla.edu/~allenday/chado/knownGene.gff Now load the GFF file as: ! ./load/bin/load_gff3.pl --organism Human --srcdb 'DB:genbank' --cache 10000 --gfffile knownGene.gff This will take several hours to complete. LOAD TRANSCRIPTOMIC DATA --- 60,85 ---- 3. Download gene model data. The following URL provides all GenBank mRNA ! records available from UCSC Hg16. Each gene model has extra data attached to it, such as LocusLink ID, SwissProt ID, gene symbols, and so on. ! wget http://sumo.genetics.ucla.edu/~allenday/chado/Hg16.knownGene.gff Now load the GFF file as: ! ./load/bin/load_gff3.pl --organism Human --srcdb 'DB:genbank' --cache 10000 --gfffile Hg16.knownGene.gff This will take several hours to complete. + + 4. Download Affymetrix probeset DesignElement alignments to UCSC Hg16. + + wget http://sumo.genetics.ucla.edu/~allenday/chado/Hg16.U133.gff + wget http://sumo.genetics.ucla.edu/~allenday/chado/Hg16.U95.gff + + and load the GFF files as: + + ./load/bin/load_gff3.pl --organism Human --srcdb 'DB:affy:U133' --cache 10000 --gfffile Hg16.U133.gff + ./load/bin/load_gff3.pl --organism Human --srcdb 'DB:affy:U95' --cache 10000 --gfffile Hg16.U95.gff + + This may take a few hours to complete. LOAD TRANSCRIPTOMIC DATA |
From: <allenday@us...> - 2003-12-22 19:53:22
|
Update of /cvsroot/gmod/schema/chado/load/bin In directory sc8-pr-cvs1:/tmp/cvs-serv27829/load/bin Modified Files: load_gff3.PLS Log Message: aesthetic tweak Index: load_gff3.PLS =================================================================== RCS file: /cvsroot/gmod/schema/chado/load/bin/load_gff3.PLS,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** load_gff3.PLS 22 Dec 2003 17:28:43 -0000 1.7 --- load_gff3.PLS 22 Dec 2003 19:53:18 -0000 1.8 *************** *** 269,276 **** $gffio->close(); ! print "$feature_count features added\n"; my $seqs_loaded = load_sequences(); ! print "$seqs_loaded sequences added\n"; print "Done\n"; --- 269,276 ---- $gffio->close(); ! print "\n$feature_count features added\n"; my $seqs_loaded = load_sequences(); ! print "\n$seqs_loaded sequences added\n"; print "Done\n"; |
From: <allenday@us...> - 2003-12-22 17:28:48
|
Update of /cvsroot/gmod/schema/chado/load/bin In directory sc8-pr-cvs1:/tmp/cvs-serv25983/load/bin Modified Files: load_gff3.PLS Log Message: structural overhaul. trying to push code into large subroutines. sectioned off support for custom tag parsing. cleaned up protein feature creation. this is compatible w/ 1.8 of ucsc_genes2gff.pl on HEAD Index: load_gff3.PLS =================================================================== RCS file: /cvsroot/gmod/schema/chado/load/bin/load_gff3.PLS,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** load_gff3.PLS 19 Dec 2003 21:35:40 -0000 1.6 --- load_gff3.PLS 22 Dec 2003 17:28:43 -0000 1.7 *************** *** 22,25 **** --- 22,26 ---- use Getopt::Long; use Term::ProgressBar; + use File::Temp qw(tempfile); $| = 1; *************** *** 124,250 **** =cut [...1198 lines suppressed...] ! $featureloc_rank{$chado_feature->id}++; ! return($chado_feature); } ! sub cache_cvterm { ! my $name = shift; ! ! ($cvterm{$name}) = Chado::Cvterm->search(name => $name) || Chado::Cvterm->search(name => ucfirst($name)); ! $cvterm{$name} = $cvterm{$name}->next() if defined($cvterm{$name}) and $cvterm{$name}->isa('Class::DBI::Iterator'); ! unless($cvterm{$name}){ ! ($cvterm{$name}) = Chado::Cvterm->find_or_create({ ! name => $name, ! cv_id => $cv->id, ! definition => 'autocreated by gmod_load_gff3.pl', ! }); ! } ! die "unable to create a '$name' entry in the cvterm table" unless $cvterm{$name}; ! } !NO!SUBS! close OUT or die "Can't close $file: $!"; |
From: <allenday@us...> - 2003-12-22 17:23:10
|
Update of /cvsroot/gmod/schema/chado/bin In directory sc8-pr-cvs1:/tmp/cvs-serv24980/bin Modified Files: ucsc_genes2gff.pl Log Message: the GFF produces is compatible with the changes about to be commited for load_gff3.PLS Index: ucsc_genes2gff.pl =================================================================== RCS file: /cvsroot/gmod/schema/chado/bin/ucsc_genes2gff.pl,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** ucsc_genes2gff.pl 19 Dec 2003 08:13:46 -0000 1.7 --- ucsc_genes2gff.pl 22 Dec 2003 17:23:07 -0000 1.8 *************** *** 20,25 **** 'annotations:s' => \$ANNOTATIONS, 'center:s' => \$CENTER, ! ) or die <<USAGE; ! Usage: $0 [options] Convert UCSC Genome Browser-format gene files into GFF3 version files. --- 20,25 ---- 'annotations:s' => \$ANNOTATIONS, 'center:s' => \$CENTER, ! ) and $ANNOTATIONS or die <<USAGE; ! Usage: $0 -annotations <dir> [options] Convert UCSC Genome Browser-format gene files into GFF3 version files. *************** *** 66,70 **** print "##gff-version 3\n"; ! if(1){ #for debugging open(KG,$KNOWNGENE) or die "couldn't open('$KNOWNGENE'): $!"; while (<KG>) { --- 66,70 ---- print "##gff-version 3\n"; ! #if(1){ #for debugging open(KG,$KNOWNGENE) or die "couldn't open('$KNOWNGENE'): $!"; while (<KG>) { *************** *** 89,92 **** --- 89,93 ---- next unless $annotation_set; print "$annotation_set=". join(",", keys %{$xref{$id}{$annotation_set}}) .';'; + } print "\n"; *************** *** 94,100 **** foreach my $annotation_set (map {($_ !~ /^sequence/ and $_ =~ /genbank:protein/) ? $_ : undef} keys %{$xref{$id}}){ next unless $annotation_set; ! print join ("\t",'.', $SRCDB,'protein','.','.','.','.', ! "ID=". join(",", keys %{$xref{$id}{$annotation_set}}) .";Parent=$id" ! ), "\n"; } } --- 95,104 ---- foreach my $annotation_set (map {($_ !~ /^sequence/ and $_ =~ /genbank:protein/) ? $_ : undef} keys %{$xref{$id}}){ next unless $annotation_set; ! ! #there may be multiple protein IDs associated with a given mRNA. list them separately. ! #there is nothing in the spec saying only one ID tag per line, but it seems sensible. ! foreach my $i (keys %{$xref{$id}{$annotation_set}}){ ! print join ("\t",'.',$SRCDB,'protein','.','.','.','.','.',"ID=$i;Parent=$id"), "\n"; ! } } } *************** *** 222,232 **** if($seq_mrna){ ! print '>'. join(',',keys(%{$xref{$kg}{'db:genbank:mrna'}})) ."\n"; ! print wrap('','',$seq_mrna) ."\n"; } if($seq_prot){ ! print '>'. join(',',keys(%{$xref{$kg}{'db:genbank:protein'}})) ."\n"; ! print wrap('','',$seq_prot) ."\n"; } } --- 226,238 ---- if($seq_mrna){ ! foreach my $k (keys %{$xref{$kg}{'db:genbank:mrna'}}){ ! print ">$k\n". wrap('','',$seq_mrna) ."\n"; ! } } if($seq_prot){ ! foreach my $k (keys %{$xref{$kg}{'db:genbank:protein'}}){ ! print ">$k\n". wrap('','',$seq_prot) ."\n"; ! } } } *************** *** 250,258 **** next if /^#/; my @line = split /\t/; ! $line[1] =~ /(.*)\.\d/; ! my $gene = $1; ! $line[4] =~ /(.*)\.\d/; ! my $protein = $1; ! if($protein ne '-') { $xref->{$gene}{'db:genbank:protein'}{$protein} = 1; } } close ANNFILE; --- 256,275 ---- next if /^#/; my @line = split /\t/; ! ! #note: this doesn't work b/c if the second regex doesn't match, $1 is still ! #leftover from the first regex. a better method is given below. ! #$line[1] =~ /(.*)\.\d/; ! #my $gene = $1; ! #$line[4] =~ /(.*)\.\d/; ! #my $protein = $1; ! ! my $gene = $line[1]; ! my $protein = $line[4]; ! $gene =~ s/\.\d+$//; ! $protein =~ s/\.\d+$//; ! ! next if $gene eq 'none'; ! ! $xref->{$gene}{'db:genbank:protein'}{$protein} = 1 unless($gene eq 'none' || $protein eq '-'); } close ANNFILE; *************** *** 321,324 **** --- 338,343 ---- sub parseKnownAffy { my $xref = shift @_; + + my $i = 0; foreach my $filename (@_){ open ANNFILE, $filename or die "Can't open file $filename: $!"; *************** *** 327,333 **** next if /^#/; my ($accession,$probeset) = split /\t/; ! $xref->{$accession}{'db:affy'}{$probeset} = 1; } close ANNFILE; } } --- 346,353 ---- next if /^#/; my ($accession,$probeset) = split /\t/; ! $xref->{$accession}{'db:affy:'.($i?'U95':'U133')}{$probeset} = 1; } close ANNFILE; + $i++; } } *************** *** 448,452 **** $xref->{$key}{'db:swissprot'}{$spID} = 1 if $spID; $xref->{$key}{'db:swissprot:display'}{$spDisplayID} = 1 if $spDisplayID; ! $xref->{$key}{'genesymbol'}{$geneSymbol} = 1 if $geneSymbol; $xref->{$key}{'db:refseq:mrna'}{$refseq} = 1 if $refseq; $xref->{$key}{'db:refseq:protein'}{$protAcc} = 1 if $protAcc; --- 468,472 ---- $xref->{$key}{'db:swissprot'}{$spID} = 1 if $spID; $xref->{$key}{'db:swissprot:display'}{$spDisplayID} = 1 if $spDisplayID; ! $xref->{$key}{'Alias'}{$geneSymbol} = 1 if $geneSymbol; $xref->{$key}{'db:refseq:mrna'}{$refseq} = 1 if $refseq; $xref->{$key}{'db:refseq:protein'}{$protAcc} = 1 if $protAcc; |
From: <boconnor@us...> - 2003-12-19 21:35:43
|
Update of /cvsroot/gmod/schema/chado/load/bin In directory sc8-pr-cvs1:/tmp/cvs-serv11380 Modified Files: load_gff3.PLS Log Message: beginnings of support for additional annotation tags Index: load_gff3.PLS =================================================================== RCS file: /cvsroot/gmod/schema/chado/load/bin/load_gff3.PLS,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** load_gff3.PLS 19 Dec 2003 17:59:00 -0000 1.5 --- load_gff3.PLS 19 Dec 2003 21:35:40 -0000 1.6 *************** *** 169,174 **** my %typemap; - my %feature = (); my %featureloc_rank = (); --- 169,174 ---- + # find needed cvterm and other pieces of information my %typemap; my %feature = (); my %featureloc_rank = (); *************** *** 225,228 **** --- 225,240 ---- unless $note_type; + my ($develops_from) = Chado::Cvterm->search(name => 'develops_from'); + ($develops_from) = Chado::Cvterm->search(name => 'Develops_From') unless $develops_from; + unless ($develops_from) { + ($develops_from) = Chado::Cvterm->find_or_create({ + name => 'develops_from', + cv_id => $cv_entry->cv_id, + definition => 'auto created by load_gff3.pl' + }); + } + die "Unable to create note type in cvterm table." + unless $develops_from; + my ($pub_type) = Chado::Cvterm->search(name => 'gff_file'); unless ($pub_type) { *************** *** 256,259 **** --- 268,273 ---- unless $pub; + + # creates the features for each gff segment while(my $gff_segment = $gffio->next_segment()) { my ($segment) = Chado::Feature->search({name => $gff_segment->display_id}); *************** *** 283,286 **** --- 297,302 ---- my @transaction; + + # iterate over each feature in the gff while(my $gff_feature = $gffio->next_feature()) { *************** *** 290,293 **** --- 306,310 ---- : $gff_feature->get_tag_values('Parent'); + # look up the feature if ($gff_feature->has_tag('ID') && !($id eq $gff_feature->seq_id) ){ ($srcfeature{$id}) = Chado::Feature->search(name => $gff_feature->seq_id); *************** *** 377,384 **** $chado_feature->dbxref_id($dbxref{$id}) if $gff_feature->has_tag('ID'); # is this the right thing to do here? ! $chado_feature->update; my $frame = $gff_feature->frame eq '.' ? 0 : $gff_feature->frame; my $chado_featureloc = Chado::Featureloc->find_or_create({ feature_id => $chado_feature->id, --- 394,402 ---- $chado_feature->dbxref_id($dbxref{$id}) if $gff_feature->has_tag('ID'); # is this the right thing to do here? ! $chado_feature->update; # flush updates to this feature object my $frame = $gff_feature->frame eq '.' ? 0 : $gff_feature->frame; + # add feature location my $chado_featureloc = Chado::Featureloc->find_or_create({ feature_id => $chado_feature->id, *************** *** 426,431 **** } ! if($gff_feature->has_tag('Alias')) { ! my @aliases = $gff_feature->get_tag_values('Alias'); foreach my $alias (@aliases) { my($chado_synonym1) = Chado::Synonym->find_or_create({ --- 444,580 ---- } ! ! # add in all the various dbxrefs ! # still need to add protein as subfeature ! if($gff_feature->has_tag('spID') or $gff_feature->has_tag('kgID') or $gff_feature->has_tag('refseq') or $gff_feature->has_tag('protAcc')){ #add as dbxref ! ! ##### Don't each of these dbxrefs need to be added to the features???? ! ! my @dbxrefs; ! my $chado_db; ! if $gff_feature->has_tag('spID') { ! $chado_db = Chado::Db->search(name => 'DB:swissprot') else die "Must have db table entry for DB:swissprot\n"; ! @dbxrefs = $gff_feature->get_tag_values('spID'); ! foreach my $dbxref (@dbxrefs) { ! my($chado_dbxref) = Chado::Dbxref->find_or_create({ ! db_id => $chado_db->id, ! accession => $dbxref, ! }); ! my($parentFeature) = $feature{$id}; ! my($feature_dbxref) = Chado::Feature_Dbxref->find_or_create ({ ! feature_id => $parentFeature->id, ! dbxref_id => $chado_dbxref->id, ! }); ! push @transaction, $chado_dbxref, $feature_dbxref; ! } ! } ! if $gff_feature->has_tag('kgID') { ! $chado_db = Chado::Db->search(name => 'DB:genbank') else die "Must have db table entry for DB:genbank\n"; ! @dbxrefs = $gff_feature->get_tag_values('kgID'); ! foreach my $dbxref (@dbxrefs) { ! my($chado_dbxref) = Chado::Dbxref->find_or_create({ ! db_id => $chado_db->id, ! accession => $dbxref, ! }); ! my($parentFeature) = $feature{$id}; ! my($feature_dbxref) = Chado::Feature_Dbxref->find_or_create ({ ! feature_id => $parentFeature->id, ! dbxref_id => $chado_dbxref->id, ! }); ! push @transaction, $chado_dbxref, $feature_dbxref; ! } ! } ! if $gff_feature->has_tag('refseq') { ! $chado_db = Chado::Db->search(name => 'DB:refseq') else die "Must have db table entry for DB:refseq\n"; ! @dbxrefs = $gff_feature->get_tag_values('refseq'); ! foreach my $dbxref (@dbxrefs) { ! my($chado_dbxref) = Chado::Dbxref->find_or_create({ ! db_id => $chado_db->id, ! accession => $dbxref, ! }); ! my($parentFeature) = $feature{$id}; ! my($feature_dbxref) = Chado::Feature_Dbxref->find_or_create ({ ! feature_id => $parentFeature->id, ! dbxref_id => $chado_dbxref->id, ! }); ! push @transaction, $chado_dbxref, $feature_dbxref; ! } ! } ! if $gff_feature->has_tag('protAcc') { ! $chado_db = Chado::Db->search(name => 'DB:refseq') else die "Must have db table entry for DB:refseq\n"; ! @dbxrefs = $gff_feature->get_tag_values('protAcc'); ! foreach my $dbxref (@dbxrefs) { ! my($chado_dbxref) = Chado::Dbxref->find_or_create({ ! db_id => $chado_db->id, ! accession => $dbxref, ! }); ! my($parentFeature) = $feature{$id}; ! my($feature_dbxref) = Chado::Feature_Dbxref->find_or_create ({ ! feature_id => $parentFeature->id, ! dbxref_id => $chado_dbxref->id, ! }); ! push @transaction, $chado_dbxref, $feature_dbxref; ! } ! } ! } ! ! ! # is there something better to use than a 'description' cvterm? ! my ($description_type) = Chado::Cvterm->search(name => 'description'); ! unless ($description_type) { ! ($description_type) = Chado::Cvterm->find_or_create({ ! name => 'description', ! cv_id => $cv_entry->cv_id, ! definition => 'auto created by load_gff3.pl' ! }); ! push @transaction, $description_type; ! } ! ! if($gff_feature->has_tag('description')){ #add as featureprop ! my($chado_featureprop) = Chado::Featureprop->find_or_create({ ! feature_id => $chado_feature->feature_id, ! type_id => $description_type->cvterm_id, ! value => $gff_feature->get_tag_value('description'), ! }); ! push @transaction, $chado_featureprop; ! } ! ! # is this the correct ontology term to use? ! my ($protein_type) = Chado::Cvterm->search(name => 'protein'); ! unless ($protein_type) { ! ($protein_type) = Chado::Cvterm->find_or_create({ ! name => 'protein', ! cv_id => $cv_entry->cv_id, ! definition => 'auto created by load_gff3.pl' ! }); ! push @transaction, $protein_type; ! ! # changed from protAcc to protein ! ! if($gff_feature->has_tag("protein")) { ! my($chado_feature) = Chado::Feature->find_or_create({ ! organism_id => $chado_organism, ! name => $gff_feature->get_tag_values('protein'), ! uniquename => $gff_feature->get_tag_values('protein').'_'. $gff_feature->primary_tag ! .'_'. $gff_feature->seq_id .':' ! . $fmin .'..'. $fmax, ! type_id => $protein_type->cvterm_id, ! }); ! ! ## !!!!!!!!! Need to add in subfeature here!!! ! my $parentFeature = $feature{$id}; # is this pulling the parent feature back here? ! my($chado_feature_rel) = Chado::Feature_Relationship->find_or_create({ ! subject_id => $chado_feature->id, ! object_id => $parentFeature->id, ! type_id => $develops_from->id, ! }); ! ! push @transaction, $chado_feature, $chado_feature_rel; ! } ! ! if($gff_feature->has_tag('Alias') or $gff_feature->has_tag('geneSymbol')) { ! my @aliases; ! if($gff_feature->has_tag('Alias')) { push @aliases, $gff_feature->get_tag_values('Alias'); } ! if($gff_feature->has_tag('geneSymbol')) { push @aliases, $gff_feature->get_tag_values('geneSymbol'); } foreach my $alias (@aliases) { my($chado_synonym1) = Chado::Synonym->find_or_create({ |
From: <peili@us...> - 2003-12-19 19:30:09
|
Update of /cvsroot/gmod/schema/chado/modules/pub In directory sc8-pr-cvs1:/tmp/cvs-serv20448 Modified Files: pub.sql Log Message: put back the lost changes made in revision 1.14 and 1.15, and change the unique key definition: a. add column uniquename text not null b. change miniref to be nullable c. use uniquename and type_id as the unique key, replacing miniref Index: pub.sql =================================================================== RCS file: /cvsroot/gmod/schema/chado/modules/pub/pub.sql,v retrieving revision 1.17 retrieving revision 1.18 diff -C2 -d -r1.17 -r1.18 *** pub.sql 28 Oct 2003 07:02:56 -0000 1.17 --- pub.sql 19 Dec 2003 19:30:06 -0000 1.18 *************** *** 16,20 **** pyear varchar(255), pages varchar(255), ! miniref varchar(255) not null, type_id int not null, foreign key (type_id) references cvterm (cvterm_id) on delete cascade, --- 16,21 ---- pyear varchar(255), pages varchar(255), ! miniref varchar(255), ! uniquename text not null, type_id int not null, foreign key (type_id) references cvterm (cvterm_id) on delete cascade, *************** *** 23,27 **** pubplace varchar(255), ! unique(miniref) ); -- title: title of paper, chapter of book, journal, etc --- 24,28 ---- pubplace varchar(255), ! unique(uniquename, type_id) ); -- title: title of paper, chapter of book, journal, etc |
From: <scottcain@us...> - 2003-12-19 17:59:04
|
Update of /cvsroot/gmod/schema/chado/load/bin In directory sc8-pr-cvs1:/tmp/cvs-serv4102/load/bin Modified Files: genbank2gff3.PLS load_gff3.PLS Log Message: minor documentation fixes and removing the 'sequence-region' line from genbank2gff Index: genbank2gff3.PLS =================================================================== RCS file: /cvsroot/gmod/schema/chado/load/bin/genbank2gff3.PLS,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** genbank2gff3.PLS 19 Dec 2003 15:03:08 -0000 1.3 --- genbank2gff3.PLS 19 Dec 2003 17:59:00 -0000 1.4 *************** *** 66,71 **** my $end = length $seq->seq; print OUT "##gff-version 3\n", ! "##date $date\n", ! "##sequence-region ",join(' ', $sname, 1, $end),"\n"; foreach my $sf (@sfs) { --- 66,72 ---- my $end = length $seq->seq; print OUT "##gff-version 3\n", ! "##date $date\n"; ! ! # "##sequence-region ",join(' ', $sname, 1, $end),"\n"; foreach my $sf (@sfs) { *************** *** 192,195 **** --- 193,197 ---- validate_ID($sf2,$acc); + print OUT $sf2->gff_string . "\n"; *************** *** 212,215 **** --- 214,218 ---- print OUT $sf3->gff_string . "\n"; + } } Index: load_gff3.PLS =================================================================== RCS file: /cvsroot/gmod/schema/chado/load/bin/load_gff3.PLS,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** load_gff3.PLS 19 Dec 2003 00:32:57 -0000 1.4 --- load_gff3.PLS 19 Dec 2003 17:59:00 -0000 1.5 *************** *** 89,95 **** chr1 NCBI chromosome 1 246127941 . . . ID=chr1 ! Note that if the '##sequence-region' notation is used, this script will ! not be able to determine the type of sequence and therefore will ! assign it the 'region' type which is very general. If that is not what you want, use the standard GFF line to specify the reference sequence. --- 89,95 ---- chr1 NCBI chromosome 1 246127941 . . . ID=chr1 ! Do not use both. Note that if the '##sequence-region' notation is used, ! this script will not be able to determine the type of sequence and therefore ! will assign it the 'region' type which is very general. If that is not what you want, use the standard GFF line to specify the reference sequence. *************** *** 137,141 **** $CACHE_SIZE ||= 1000; ! die unless $GFFFILE "\nYou must specify a GFF file\n"; #deal with GFF3 files that contain sequence --- 137,141 ---- $CACHE_SIZE ||= 1000; ! die "\nYou must specify a GFF file\n" unless $GFFFILE; #deal with GFF3 files that contain sequence |
From: <sheldon_mckay@us...> - 2003-12-19 15:03:11
|
Update of /cvsroot/gmod/schema/chado/load/bin In directory sc8-pr-cvs1:/tmp/cvs-serv28420 Modified Files: genbank2gff3.PLS Log Message: Added a GFF 3 header to the output so it will be parsed correctly by Bio::DB::GFF Index: genbank2gff3.PLS =================================================================== RCS file: /cvsroot/gmod/schema/chado/load/bin/genbank2gff3.PLS,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** genbank2gff3.PLS 18 Dec 2003 16:41:58 -0000 1.2 --- genbank2gff3.PLS 19 Dec 2003 15:03:08 -0000 1.3 *************** *** 1,3 **** ! #!perl -w use Config; --- 1,3 ---- ! #!/usr/bin/perl use Config; *************** *** 59,62 **** --- 59,72 ---- #my @top_sfs = $unflattener->get_SeqFeatures; my @skipped_lines; + + + # print a GFF header + my $date = localtime; + my $sname = $seq->accession || $seq->display_name; + my $end = length $seq->seq; + print OUT "##gff-version 3\n", + "##date $date\n", + "##sequence-region ",join(' ', $sname, 1, $end),"\n"; + foreach my $sf (@sfs) { *************** *** 239,243 **** --- 249,256 ---- } } + !NO!SUBS! + + close OUT or die "Can't close $file: $!"; chmod 0755, $file or die "Can't reset permissions for $file: $!\n"; |
From: <allenday@us...> - 2003-12-19 08:13:49
|
Update of /cvsroot/gmod/schema/chado/bin In directory sc8-pr-cvs1:/tmp/cvs-serv23435/bin Modified Files: ucsc_genes2gff.pl Log Message: debug Index: ucsc_genes2gff.pl =================================================================== RCS file: /cvsroot/gmod/schema/chado/bin/ucsc_genes2gff.pl,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** ucsc_genes2gff.pl 19 Dec 2003 08:11:24 -0000 1.6 --- ucsc_genes2gff.pl 19 Dec 2003 08:13:46 -0000 1.7 *************** *** 66,70 **** print "##gff-version 3\n"; ! if(0){ open(KG,$KNOWNGENE) or die "couldn't open('$KNOWNGENE'): $!"; while (<KG>) { --- 66,70 ---- print "##gff-version 3\n"; ! if(1){ #for debugging open(KG,$KNOWNGENE) or die "couldn't open('$KNOWNGENE'): $!"; while (<KG>) { *************** *** 213,217 **** } close(KG) or die "couldn't close('$KNOWNGENE'): $!"; ! } print "##FASTA\n"; --- 213,217 ---- } close(KG) or die "couldn't close('$KNOWNGENE'): $!"; ! #} for debugging print "##FASTA\n"; *************** *** 444,450 **** $description = uri_escape($description); - # my $protAccession = $mrna2protein->{$mRNA}; # pulls out the protein genbank accession - - # $xref->{$key}{'db:genbank:protein'}{$protAccession} = 1 if $protAccession; $xref->{$key}{'db:genbank:mrna'}{$kgID} = 1 if $kgID; $xref->{$key}{'db:genbank:mrna'}{$mRNA} = 1 if $mRNA; --- 444,447 ---- |
From: <allenday@us...> - 2003-12-19 08:11:27
|
Update of /cvsroot/gmod/schema/chado/bin In directory sc8-pr-cvs1:/tmp/cvs-serv22846/bin Modified Files: ucsc_genes2gff.pl Log Message: overhaul to simplify the script. i'm running into some strangeness though where multiple GB identifiers map to a single protein sequence and mrna GB id. this doesn't happen for mrna GB identifiers and associated sequence... brian, any idea what's going on here? added pfam, affy u133, and affy u95 parsing Index: ucsc_genes2gff.pl =================================================================== RCS file: /cvsroot/gmod/schema/chado/bin/ucsc_genes2gff.pl,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** ucsc_genes2gff.pl 19 Dec 2003 06:49:39 -0000 1.5 --- ucsc_genes2gff.pl 19 Dec 2003 08:11:24 -0000 1.6 *************** *** 8,11 **** --- 8,12 ---- use URI::Escape; use Text::Wrap; + $Text::Wrap::columns = 79; use Bio::SeqIO; use Bio::SeqFeature::Generic; *************** *** 44,60 **** my $KNOWNGENEMRNA = $ANNOTATIONS.'/knownGeneMrna.txt'; my $KNOWNLOCUSLINK = $ANNOTATIONS.'/knownToLocusLink.txt'; my $GENBANK = $ANNOTATIONS.'/genbank2accessions.txt'; my $LOCACC = $ANNOTATIONS.'/loc2acc'; ! my $mrna2protein = parseGenbank($GENBANK); ! my $kgxref = parseKgXref($KGXREF); ! my $loc2acc = parseLocAcc($LOCACC); # the best way I've found so far to link Genbank mRNA accession to Genbank protein accession ! my $knowngenepep = parseKnownGenePep($KNOWNGENEPEP); ! my $knowngenemrna = parseKnownGeneMrna($KNOWNGENEMRNA); ! my $knownlocuslink = parseKnownLocusLink($KNOWNLOCUSLINK); # need to pull in the omim and other annotations too print "##gff-version 3\n"; ! open(KG,$KNOWNGENE) or die "couldn't open('$KNOWNGENE'): $!"; while (<KG>) { --- 45,70 ---- my $KNOWNGENEMRNA = $ANNOTATIONS.'/knownGeneMrna.txt'; my $KNOWNLOCUSLINK = $ANNOTATIONS.'/knownToLocusLink.txt'; + my $KNOWNPFAM = $ANNOTATIONS.'/knownToPfam.txt'; + my $KNOWNU133 = $ANNOTATIONS.'/knownToU133.txt'; + my $KNOWNU95 = $ANNOTATIONS.'/knownToU95.txt'; my $GENBANK = $ANNOTATIONS.'/genbank2accessions.txt'; my $LOCACC = $ANNOTATIONS.'/loc2acc'; ! my %xref; ! ! parseGenbank(\%xref,$GENBANK); ! parseKnownGenePep(\%xref,$KNOWNGENEPEP); ! parseKnownGeneMrna(\%xref,$KNOWNGENEMRNA); ! parseKgXref(\%xref,$KGXREF); ! parseLocAcc(\%xref,$LOCACC); # the best way I've found so far ! # to link Genbank mRNA accession to ! # Genbank protein accession ! parseKnownLocusLink(\%xref,$KNOWNLOCUSLINK); ! parseKnownAffy(\%xref,$KNOWNU133,$KNOWNU95); ! parseKnownPfam(\%xref,$KNOWNPFAM); # need to pull in the omim and other annotations too print "##gff-version 3\n"; ! if(0){ open(KG,$KNOWNGENE) or die "couldn't open('$KNOWNGENE'): $!"; while (<KG>) { *************** *** 75,96 **** # print the transcript print join ("\t",$chrom,$SRCDB,'mRNA',$txStart,$txEnd,'.',$strand,'.',"ID=$id;"); ! if(defined($kgxref->{$id})) { ! foreach my $annotation_set (keys %{$kgxref->{$id}}) { ! print "$annotation_set=". join (",", keys %{$kgxref->{$id}->{$annotation_set}}) .';'; ! ! } ! if(defined $knownlocuslink->{$id}) { ! print "db:locus=", $knownlocuslink->{$id}; } - print "\n"; ! # now write out stuff for protein ! ! my @protGenBank = keys (%{$kgxref->{$id}->{'db:genbank:protein'}}); ! my $protGenBank = $protGenBank[0]; ! ! if(defined ($protGenBank)) { ! print join ("\t",'.', $SRCDB,'protein','.','.','.','.',"ID=$protGenBank;Parent=$id"), "\n"; } } --- 85,100 ---- # print the transcript print join ("\t",$chrom,$SRCDB,'mRNA',$txStart,$txEnd,'.',$strand,'.',"ID=$id;"); ! if(defined($xref{$id})) { ! foreach my $annotation_set (map {($_ !~ /^sequence/) ? $_ : undef} keys %{$xref{$id}}) { ! next unless $annotation_set; ! print "$annotation_set=". join(",", keys %{$xref{$id}{$annotation_set}}) .';'; } print "\n"; ! foreach my $annotation_set (map {($_ !~ /^sequence/ and $_ =~ /genbank:protein/) ? $_ : undef} keys %{$xref{$id}}){ ! next unless $annotation_set; ! print join ("\t",'.', $SRCDB,'protein','.','.','.','.', ! "ID=". join(",", keys %{$xref{$id}{$annotation_set}}) .";Parent=$id" ! ), "\n"; } } *************** *** 209,229 **** } close(KG) or die "couldn't close('$KNOWNGENE'): $!"; ! # for protein/mrna printing ! if(defined($kgxref) && (defined($knowngenepep) || defined($knowngenemrna))) { ! print "##FASTA\n"; ! # now print all my lovely protein sequence ! foreach my $protein (keys %{$knowngenepep}) { ! print ">".$protein."\n"; ! $Text::Wrap::columns = 79; ! print wrap('', '', $knowngenepep->{$protein}); ! print "\n"; } ! # now all the mRNA sequence ! foreach my $mrna (keys %{$knowngenemrna}) { ! print ">".$mrna."\n"; ! $Text::Wrap::columns = 79; ! print wrap('', '', $knowngenemrna->{$mrna}); ! print "\n"; } } --- 213,232 ---- } close(KG) or die "couldn't close('$KNOWNGENE'): $!"; + } ! print "##FASTA\n"; ! ! foreach my $kg (keys %xref){ ! my $seq_mrna = $xref{$kg}{'sequence:mrna'}; ! my $seq_prot = $xref{$kg}{'sequence:protein'}; ! ! if($seq_mrna){ ! print '>'. join(',',keys(%{$xref{$kg}{'db:genbank:mrna'}})) ."\n"; ! print wrap('','',$seq_mrna) ."\n"; } ! ! if($seq_prot){ ! print '>'. join(',',keys(%{$xref{$kg}{'db:genbank:protein'}})) ."\n"; ! print wrap('','',$seq_prot) ."\n"; } } *************** *** 241,246 **** sub parseLocAcc { ! my $filename = shift; ! my $annotations = {}; open ANNFILE, $filename or die "Can't open file $filename: $!"; while(<ANNFILE>) { --- 244,248 ---- sub parseLocAcc { ! my($xref,$filename) = @_; open ANNFILE, $filename or die "Can't open file $filename: $!"; while(<ANNFILE>) { *************** *** 252,259 **** $line[4] =~ /(.*)\.\d/; my $protein = $1; ! if($protein ne '-') { $annotations->{$gene} = $protein; } } close ANNFILE; - return ($annotations); } --- 254,260 ---- $line[4] =~ /(.*)\.\d/; my $protein = $1; ! if($protein ne '-') { $xref->{$gene}{'db:genbank:protein'}{$protein} = 1; } } close ANNFILE; } *************** *** 271,276 **** sub parseKnownLocusLink { ! my $filename = shift; ! my $annotations = {}; open ANNFILE, $filename or die "Can't open file $filename: $!"; while(<ANNFILE>) { --- 272,276 ---- sub parseKnownLocusLink { ! my($xref,$filename) = @_; open ANNFILE, $filename or die "Can't open file $filename: $!"; while(<ANNFILE>) { *************** *** 278,285 **** next if /^#/; my ($accession,$locuslink) = split /\t/; ! $annotations->{$accession} = $locuslink; } close ANNFILE; ! return ($annotations); } --- 278,334 ---- next if /^#/; my ($accession,$locuslink) = split /\t/; ! $xref->{$accession}{'db:locuslink'}{$locuslink} = 1; } close ANNFILE; ! } ! ! =head2 parseKnownPfam ! ! Title : parseKnownPfam ! Usage : ! Function: ! Example : ! Returns : ! Args : ! ! ! =cut ! ! sub parseKnownPfam { ! my($xref,$filename) = @_; ! open ANNFILE, $filename or die "Can't open file $filename: $!"; ! while(<ANNFILE>) { ! chomp; ! next if /^#/; ! my ($accession,$pfam) = split /\t/; ! $xref->{$accession}{'db:pfam'}{$pfam} = 1; ! } ! close ANNFILE; ! } ! ! =head2 parseKnownAffy ! ! Title : parseKnownAffy ! Usage : ! Function: ! Example : ! Returns : ! Args : ! ! ! =cut ! ! sub parseKnownAffy { ! my $xref = shift @_; ! foreach my $filename (@_){ ! open ANNFILE, $filename or die "Can't open file $filename: $!"; ! while(<ANNFILE>) { ! chomp; ! next if /^#/; ! my ($accession,$probeset) = split /\t/; ! $xref->{$accession}{'db:affy'}{$probeset} = 1; ! } ! close ANNFILE; ! } } *************** *** 297,302 **** sub parseKnownGeneMrna { ! my $filename = shift; ! my $annotations = {}; open ANNFILE, $filename or die "Can't open file $filename: $!"; while(<ANNFILE>) { --- 346,350 ---- sub parseKnownGeneMrna { ! my($xref,$filename) = @_; open ANNFILE, $filename or die "Can't open file $filename: $!"; while(<ANNFILE>) { *************** *** 304,311 **** next if /^#/; my ($accession,$sequence) = split /\t/; ! $annotations->{$accession} = $sequence; } close ANNFILE; - return ($annotations); } --- 352,358 ---- next if /^#/; my ($accession,$sequence) = split /\t/; ! $xref->{$accession}{'sequence:mrna'} = $sequence; } close ANNFILE; } *************** *** 324,329 **** sub parseKnownGenePep { ! my $filename = shift; ! my $annotations = {}; open ANNFILE, $filename or die "Can't open file $filename: $!"; while(<ANNFILE>) { --- 371,375 ---- sub parseKnownGenePep { ! my($xref,$filename) = @_; open ANNFILE, $filename or die "Can't open file $filename: $!"; while(<ANNFILE>) { *************** *** 331,339 **** next if /^#/; my ($accession,$sequence) = split /\t/; ! my $protGenbankId = $mrna2protein->{$accession}; ! $annotations->{$protGenbankId} = $sequence; } close ANNFILE; - return ($annotations); } --- 377,385 ---- next if /^#/; my ($accession,$sequence) = split /\t/; ! ! $xref->{$accession}{'sequence:protein'} = $sequence; ! # $xref->{$protGenbankId}{'sequence:protein'} = $sequence; } close ANNFILE; } *************** *** 341,345 **** Title : mrna2protein ! Usage : creates a hash between the mRNA genbank accession (used in UCSC DB to key everything) and the proper genbank protein accession Function: Example : --- 387,393 ---- Title : mrna2protein ! Usage : creates a hash between the mRNA genbank accession ! (used in UCSC DB to key everything) and the proper ! genbank protein accession Function: Example : *************** *** 351,365 **** sub parseGenbank { ! my $file = shift; ! my $annotations = {}; # stores the mRNA genbank id as key, protein genbank id as value ! open ANNFILE, $file or die "Can't open file $file: $!"; ! while(<ANNFILE>) { ! chomp; ! next if /^#/; ! my ($mrna, $prot) = split /\t/; ! $annotations->{$mrna} = $prot; ! } ! close ANNFILE; ! return($annotations); } --- 399,415 ---- sub parseGenbank { ! my($xref,$filename) = @_; ! # my $file = shift; ! # my $annotations = {}; # stores the mRNA genbank id as key, protein genbank id as value ! open ANNFILE, $filename or die "Can't open file $filename: $!"; ! while(<ANNFILE>) { ! chomp; ! next if /^#/; ! my ($mrna, $prot) = split /\t/; ! $xref->{$mrna}{'db:genbank:protein'}{$prot} = 1; ! # $annotations->{$mrna} = $prot; ! } ! close ANNFILE; ! # return($annotations); } *************** *** 379,385 **** sub parseKgXref { ! my $filename = shift; ! my $annotations = {}; ! open ANNFILE, $filename or die "Can't open file $filename: $!"; while(<ANNFILE>) { chomp; --- 429,435 ---- sub parseKgXref { ! my($xref,$filename) = @_; ! ! open(ANNFILE, $filename) or die "Can't open file $filename: $!"; while(<ANNFILE>) { chomp; *************** *** 394,411 **** $description = uri_escape($description); ! my $protAccession = $mrna2protein->{$mRNA}; # pulls out the protein genbank accession ! $annotations->{$key}->{'db:genbank:protein'}->{$protAccession} = 1 if $protAccession; ! $annotations->{$key}->{'db:genbank:mrna'}->{$kgID} = 1 if $kgID; ! $annotations->{$key}->{'db:genbank:mrna'}->{$mRNA} = 1 if $mRNA; ! $annotations->{$key}->{'db:swissprot'}->{$spID} = 1 if $spID; ! $annotations->{$key}->{'db:swissprot:display'}->{$spDisplayID} = 1 if $spDisplayID; ! $annotations->{$key}->{'genesymbol'}->{$geneSymbol} = 1 if $geneSymbol; ! $annotations->{$key}->{'db:refseq:mrna'}->{$refseq} = 1 if $refseq; ! $annotations->{$key}->{'db:refseq:protein'}->{$protAcc} = 1 if $protAcc; ! $annotations->{$key}->{'description'}->{$description} = 1 if $description; } ! close ANNFILE; ! return($annotations); } --- 444,460 ---- $description = uri_escape($description); ! # my $protAccession = $mrna2protein->{$mRNA}; # pulls out the protein genbank accession ! # $xref->{$key}{'db:genbank:protein'}{$protAccession} = 1 if $protAccession; ! $xref->{$key}{'db:genbank:mrna'}{$kgID} = 1 if $kgID; ! $xref->{$key}{'db:genbank:mrna'}{$mRNA} = 1 if $mRNA; ! $xref->{$key}{'db:swissprot'}{$spID} = 1 if $spID; ! $xref->{$key}{'db:swissprot:display'}{$spDisplayID} = 1 if $spDisplayID; ! $xref->{$key}{'genesymbol'}{$geneSymbol} = 1 if $geneSymbol; ! $xref->{$key}{'db:refseq:mrna'}{$refseq} = 1 if $refseq; ! $xref->{$key}{'db:refseq:protein'}{$protAcc} = 1 if $protAcc; ! $xref->{$key}{'description'}{$description} = 1 if $description; } ! close(ANNFILE); } |
From: <allenday@us...> - 2003-12-19 06:49:42
|
Update of /cvsroot/gmod/schema/chado/bin In directory sc8-pr-cvs1:/tmp/cvs-serv12026/bin Modified Files: ucsc_genes2gff.pl Log Message: this is starting to look good Index: ucsc_genes2gff.pl =================================================================== RCS file: /cvsroot/gmod/schema/chado/bin/ucsc_genes2gff.pl,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** ucsc_genes2gff.pl 18 Dec 2003 18:28:00 -0000 1.4 --- ucsc_genes2gff.pl 19 Dec 2003 06:49:39 -0000 1.5 *************** *** 14,29 **** my $executable = basename($0); ! my ($SRC,$ORIGIN,$KNOWNGENEPEP,$KNOWNGENEMRNA,$KNOWNLOCUSLINK,$GENBANK,$KGXREF,$LOCACC,$CENTER); ! GetOptions('src:s' => \$SRC, ! 'origin:i' => \$ORIGIN, ! 'kgXref:s' => \$KGXREF, ! 'knownGenePep:s' => \$KNOWNGENEPEP, ! 'knownGeneMrna:s' => \$KNOWNGENEMRNA, ! 'knownLocusLink:s' => \$KNOWNLOCUSLINK, ! 'genbank:s' => \$GENBANK, ! 'loc2acc:s' => \$LOCACC, ! 'center:s' => \$CENTER, ! ) or die <<USAGE; ! Usage: cat knownGene.txt | $0 [options] Convert UCSC Genome Browser-format gene files into GFF3 version files. --- 14,24 ---- my $executable = basename($0); ! my ($SRCDB,$ORIGIN,$ANNOTATIONS,$CENTER); ! GetOptions('srcdb:s' => \$SRCDB, ! 'origin:i' => \$ORIGIN, ! 'annotations:s' => \$ANNOTATIONS, ! 'center:s' => \$CENTER, ! ) or die <<USAGE; ! Usage: $0 [options] Convert UCSC Genome Browser-format gene files into GFF3 version files. *************** *** 34,53 **** Options: ! -src <string> Choose a source for the gene, default "UCSC" -origin <integer> Choose a relative position to number from, default is "1" ! -annotations <file> Annotations file -center <string> ??? USAGE ! $SRC ||= 'UCSC'; ! $ORIGIN ||= 1; ! $KGXREF ||='kgXref.txt'; ! $KNOWNGENEPEP ||= 'knownGenePep.txt'; ! $KNOWNGENEMRNA ||= 'knownGeneMrna.txt'; ! $KNOWNLOCUSLINK ||= 'knownToLocusLink.txt'; ! $GENBANK ||= 'genbank2accessions.txt'; ! $LOCACC ||= 'loc2acc'; ! $CENTER ||= 'unigene'; my $mrna2protein = parseGenbank($GENBANK); --- 29,49 ---- Options: ! -srcdb <string> Choose a source for the gene, default "UCSC" -origin <integer> Choose a relative position to number from, default is "1" ! -annotations <dir> Directory containing UCSC annotation files -center <string> ??? USAGE ! $SRCDB ||= 'UCSC'; ! my $CENTER ||= 'unigene'; ! $ORIGIN ||= 1; ! my $KGXREF = $ANNOTATIONS.'/kgXref.txt'; ! my $KNOWNGENE = $ANNOTATIONS.'/knownGene.txt'; ! my $KNOWNGENEPEP = $ANNOTATIONS.'/knownGenePep.txt'; ! my $KNOWNGENEMRNA = $ANNOTATIONS.'/knownGeneMrna.txt'; ! my $KNOWNLOCUSLINK = $ANNOTATIONS.'/knownToLocusLink.txt'; ! my $GENBANK = $ANNOTATIONS.'/genbank2accessions.txt'; ! my $LOCACC = $ANNOTATIONS.'/loc2acc'; my $mrna2protein = parseGenbank($GENBANK); *************** *** 61,65 **** print "##gff-version 3\n"; ! while (<>) { chomp; next if /^\#/;; --- 57,62 ---- print "##gff-version 3\n"; ! open(KG,$KNOWNGENE) or die "couldn't open('$KNOWNGENE'): $!"; ! while (<KG>) { chomp; next if /^\#/;; *************** *** 77,101 **** # print the transcript ! print join ("\t",$chrom,$SRC,'mRNA',$txStart,$txEnd,'.',$strand,'.',"ID=$id"); ! #print ! if(defined($kgxref->{$id})){ ! foreach my $annotation_set (keys %{$kgxref->{$id}}){ ! print ";"; ! print "$annotation_set=", join (",", keys %{$kgxref->{$id}->{$annotation_set}}); ! #foreach my $annotation_key (keys %{$annotation->{$id}->{$annotation_set}}){ ! # print "$annotation_key=", join (",", $annotation->{$id}->{$annotation_set}->{$annotation_key}); ! #} ! } ! if(defined $knownlocuslink->{$id}) { print ";locuslink=", $knownlocuslink->{id}; } ! print "\n"; ! # # now write out stuff for protein ! my @protGenBank = keys (%{$kgxref->{$id}->{protein}}); ! my $protGenBank = $protGenBank[0]; ! if(defined ($protGenBank)) { print join ("\t",'.', $SRC,'protein','.','.','.','.',"ID=$protGenBank;Parent=$id"), "\n"; } } ! # print "\n"; ! #print join ("\t","dbxref=".$annotations->{$id}),"\n"; ! #print Dumper($annotations->{$id}); # now handle the CDS entries -- the tricky part is the need to keep # track of phase --- 74,99 ---- # print the transcript ! print join ("\t",$chrom,$SRCDB,'mRNA',$txStart,$txEnd,'.',$strand,'.',"ID=$id;"); ! if(defined($kgxref->{$id})) { ! foreach my $annotation_set (keys %{$kgxref->{$id}}) { ! print "$annotation_set=". join (",", keys %{$kgxref->{$id}->{$annotation_set}}) .';'; ! } ! if(defined $knownlocuslink->{$id}) { ! print "db:locus=", $knownlocuslink->{$id}; ! } ! ! print "\n"; ! ! # now write out stuff for protein ! ! my @protGenBank = keys (%{$kgxref->{$id}->{'db:genbank:protein'}}); ! my $protGenBank = $protGenBank[0]; ! ! if(defined ($protGenBank)) { ! print join ("\t",'.', $SRCDB,'protein','.','.','.','.',"ID=$protGenBank;Parent=$id"), "\n"; ! } } ! # now handle the CDS entries -- the tricky part is the need to keep # track of phase *************** *** 104,139 **** my @exon_ends = map {$_-$ORIGIN} split ',',$exonEnds; ! if ($strand eq '+') { ! for (my $i=0;$i<@exon_starts;$i++) { my $exon_start = $exon_starts[$i] + 1; my $exon_end = $exon_ends[$i]; my ($utr_start,$utr_end,$cds_start,$cds_end); ! if ($exon_start < $cdsStart) { # in a 5' UTR ! $utr_start = $exon_start; ! } elsif ($exon_start > $cdsEnd) { ! $utr_start = $exon_start; } else { ! $cds_start = $exon_start; } ! if ($exon_end < $cdsStart) { ! $utr_end = $exon_end; ! } elsif ($exon_end > $cdsEnd) { ! $utr_end = $exon_end; } else { ! $cds_end = $exon_end; } ! if ($utr_start && !$utr_end) { # half in half out on 5' end ! $utr_end = $cdsStart - 1; ! $cds_start = $cdsStart; ! $cds_end = $exon_end; } ! if ($utr_end && !$utr_start) { # half in half out on 3' end ! $utr_start = $cdsEnd + 1; ! $cds_end = $cdsEnd; ! $cds_start = $exon_start; } --- 102,137 ---- my @exon_ends = map {$_-$ORIGIN} split ',',$exonEnds; ! if($strand eq '+') { ! for(my $i=0;$i<scalar(@exon_starts);$i++) { my $exon_start = $exon_starts[$i] + 1; my $exon_end = $exon_ends[$i]; my ($utr_start,$utr_end,$cds_start,$cds_end); ! if($exon_start < $cdsStart) { # in a 5' UTR ! $utr_start = $exon_start; ! } elsif($exon_start > $cdsEnd) { ! $utr_start = $exon_start; } else { ! $cds_start = $exon_start; } ! if($exon_end < $cdsStart) { ! $utr_end = $exon_end; ! } elsif($exon_end > $cdsEnd) { ! $utr_end = $exon_end; } else { ! $cds_end = $exon_end; } ! if($utr_start && !$utr_end) { # half in half out on 5' end ! $utr_end = $cdsStart - 1; ! $cds_start = $cdsStart; ! $cds_end = $exon_end; } ! if($utr_end && !$utr_start) { # half in half out on 3' end ! $utr_start = $cdsEnd + 1; ! $cds_end = $cdsEnd; ! $cds_start = $exon_start; } *************** *** 141,194 **** die "programmer error, cds_start and no cds_end" unless defined $cds_start == defined $cds_end; ! if (defined $utr_start && $utr_start <= $utr_end && $utr_start < $cdsStart) { ! # print join ("\t",$chrom,$SRC,"5'-UTR",$utr_start,$utr_end,'.',$strand,'.',"Parent=$id"),"\n" ! print join ("\t",$chrom,$SRC,"five_prime_UTR",$utr_start,$utr_end,'.',$strand,'.',"Parent=$id"),"\n" } ! if (defined $cds_start && $cds_start <= $cds_end) { ! print join ("\t",$chrom,$SRC,'CDS',$cds_start,$cds_end,'.',$strand,$phase,"Parent=$id"),"\n"; ! $phase = (($cds_end-$cds_start+1-$phase)) % 3; } ! if (defined $utr_start && $utr_start <= $utr_end && $utr_start > $cdsEnd) { ! # print join ("\t",$chrom,$SRC,"3'-UTR",,$utr_start,$utr_end,'.',$strand,'.',"Parent=$id"),"\n" ! print join ("\t",$chrom,$SRC,"three_prime_UTR",,$utr_start,$utr_end,'.',$strand,'.',"Parent=$id"),"\n" } } } ! if ($strand eq '-') { my @lines; ! for (my $i=@exon_starts-1; $i>=0; $i--) { # count backwards my $exon_start = $exon_starts[$i] + 1; my $exon_end = $exon_ends[$i]; my ($utr_start,$utr_end,$cds_start,$cds_end); ! if ($exon_end > $cdsEnd) { # in a 5' UTR ! $utr_end = $exon_end; ! } elsif ($exon_end < $cdsStart) { ! $utr_end = $exon_end; } else { ! $cds_end = $exon_end; } ! if ($exon_start > $cdsEnd) { ! $utr_start = $exon_start; ! } elsif ($exon_start < $cdsStart) { ! $utr_start = $exon_start; } else { ! $cds_start = $exon_start; } ! if ($utr_start && !$utr_end) { # half in half out on 3' end ! $utr_end = $cdsStart - 1; ! $cds_start = $cdsStart; ! $cds_end = $exon_end; } if ($utr_end && !$utr_start) { # half in half out on 5' end ! $utr_start = $cdsEnd + 1; ! $cds_end = $cdsEnd; ! $cds_start = $exon_start; } --- 139,190 ---- die "programmer error, cds_start and no cds_end" unless defined $cds_start == defined $cds_end; ! if(defined $utr_start && $utr_start <= $utr_end && $utr_start < $cdsStart) { ! print join ("\t",$chrom,$SRCDB,"five_prime_UTR",$utr_start,$utr_end,'.',$strand,'.',"Parent=$id"),"\n" } ! if(defined $cds_start && $cds_start <= $cds_end) { ! print join ("\t",$chrom,$SRCDB,'CDS',$cds_start,$cds_end,'.',$strand,$phase,"Parent=$id"),"\n"; ! $phase = (($cds_end-$cds_start+1-$phase)) % 3; } ! if(defined $utr_start && $utr_start <= $utr_end && $utr_start > $cdsEnd) { ! print join ("\t",$chrom,$SRCDB,"three_prime_UTR",,$utr_start,$utr_end,'.',$strand,'.',"Parent=$id"),"\n" } } } ! if($strand eq '-') { my @lines; ! for(my $i=@exon_starts-1; $i>=0; $i--) { # count backwards my $exon_start = $exon_starts[$i] + 1; my $exon_end = $exon_ends[$i]; my ($utr_start,$utr_end,$cds_start,$cds_end); ! if($exon_end > $cdsEnd) { # in a 5' UTR ! $utr_end = $exon_end; ! } elsif($exon_end < $cdsStart) { ! $utr_end = $exon_end; } else { ! $cds_end = $exon_end; } ! if($exon_start > $cdsEnd) { ! $utr_start = $exon_start; ! } elsif($exon_start < $cdsStart) { ! $utr_start = $exon_start; } else { ! $cds_start = $exon_start; } ! if($utr_start && !$utr_end) { # half in half out on 3' end ! $utr_end = $cdsStart - 1; ! $cds_start = $cdsStart; ! $cds_end = $exon_end; } if ($utr_end && !$utr_start) { # half in half out on 5' end ! $utr_start = $cdsEnd + 1; ! $cds_end = $cdsEnd; ! $cds_start = $exon_start; } *************** *** 196,211 **** die "programmer error, cds_start and no cds_end" unless defined $cds_start == defined $cds_end; ! if (defined $utr_start && $utr_start <= $utr_end && $utr_start > $cdsEnd) { ! # unshift @lines,join ("\t",$chrom,$SRC,"5'-UTR",,$utr_start,$utr_end,'.',$strand,'.',"Parent=$id"),"\n" ! unshift @lines,join ("\t",$chrom,$SRC,"five_prime_UTR",,$utr_start,$utr_end,'.',$strand,'.',"Parent=$id"),"\n" } ! if (defined $cds_start && $cds_start <= $cds_end) { ! unshift @lines,join ("\t",$chrom,$SRC,'CDS',$cds_start,$cds_end,'.',$strand,$phase,"Parent=$id"),"\n"; ! $phase = (($cds_end-$cds_start+1-$phase)) % 3; } ! if (defined $utr_start && $utr_start <= $utr_end && $utr_end < $cdsStart) { ! # unshift @lines,join ("\t",$chrom,$SRC,"3'-UTR",$utr_start,$utr_end,'.',$strand,'.',"Parent=$id"),"\n" ! unshift @lines,join ("\t",$chrom,$SRC,"three_prime_UTR",$utr_start,$utr_end,'.',$strand,'.',"Parent=$id"),"\n" } --- 192,205 ---- die "programmer error, cds_start and no cds_end" unless defined $cds_start == defined $cds_end; ! if(defined $utr_start && $utr_start <= $utr_end && $utr_start > $cdsEnd) { ! unshift @lines,join ("\t",$chrom,$SRCDB,"five_prime_UTR",,$utr_start,$utr_end,'.',$strand,'.',"Parent=$id"),"\n" } ! if(defined $cds_start && $cds_start <= $cds_end) { ! unshift @lines,join ("\t",$chrom,$SRCDB,'CDS',$cds_start,$cds_end,'.',$strand,$phase,"Parent=$id"),"\n"; ! $phase = (($cds_end-$cds_start+1-$phase)) % 3; } ! if(defined $utr_start && $utr_start <= $utr_end && $utr_end < $cdsStart) { ! unshift @lines,join ("\t",$chrom,$SRCDB,"three_prime_UTR",$utr_start,$utr_end,'.',$strand,'.',"Parent=$id"),"\n" } *************** *** 214,220 **** } } # for protein/mrna printing ! if (defined($kgxref) && (defined($knowngenepep) || defined($knowngenemrna))) { print "##FASTA\n"; # now print all my lovely protein sequence --- 208,215 ---- } } + close(KG) or die "couldn't close('$KNOWNGENE'): $!"; # for protein/mrna printing ! if(defined($kgxref) && (defined($knowngenepep) || defined($knowngenemrna))) { print "##FASTA\n"; # now print all my lovely protein sequence *************** *** 245,255 **** =cut ! sub parseLocAcc{ my $filename = shift; my $annotations = {}; ! open ANNFILE, $filename or die "Can't open file $filename\n"; while(<ANNFILE>) { chomp; ! next if /^\#/;; my @line = split /\t/; $line[1] =~ /(.*)\.\d/; --- 240,250 ---- =cut ! sub parseLocAcc { my $filename = shift; my $annotations = {}; ! open ANNFILE, $filename or die "Can't open file $filename: $!"; while(<ANNFILE>) { chomp; ! next if /^#/; my @line = split /\t/; $line[1] =~ /(.*)\.\d/; *************** *** 275,285 **** =cut ! sub parseKnownLocusLink{ my $filename = shift; my $annotations = {}; ! open ANNFILE, $filename or die "Can't open file $filename\n"; while(<ANNFILE>) { chomp; ! next if /^\#/;; my ($accession,$locuslink) = split /\t/; $annotations->{$accession} = $locuslink; --- 270,280 ---- =cut ! sub parseKnownLocusLink { my $filename = shift; my $annotations = {}; ! open ANNFILE, $filename or die "Can't open file $filename: $!"; while(<ANNFILE>) { chomp; ! next if /^#/; my ($accession,$locuslink) = split /\t/; $annotations->{$accession} = $locuslink; *************** *** 301,311 **** =cut ! sub parseKnownGeneMrna{ my $filename = shift; my $annotations = {}; ! open ANNFILE, $filename or die "Can't open file $filename\n"; while(<ANNFILE>) { chomp; ! next if /^\#/;; my ($accession,$sequence) = split /\t/; $annotations->{$accession} = $sequence; --- 296,306 ---- =cut ! sub parseKnownGeneMrna { my $filename = shift; my $annotations = {}; ! open ANNFILE, $filename or die "Can't open file $filename: $!"; while(<ANNFILE>) { chomp; ! next if /^#/; my ($accession,$sequence) = split /\t/; $annotations->{$accession} = $sequence; *************** *** 328,342 **** =cut ! sub parseKnownGenePep{ my $filename = shift; my $annotations = {}; ! open ANNFILE, $filename or die "Can't open file $filename\n"; while(<ANNFILE>) { chomp; ! next if /^\#/;; my ($accession,$sequence) = split /\t/; - #my @protAcc = keys %{$kgxref->{$accession}->{protAcc}}; - #print @protAcc[0]."\n"; - #$annotations->{@protAcc[0]} = $sequence; my $protGenbankId = $mrna2protein->{$accession}; $annotations->{$protGenbankId} = $sequence; --- 323,334 ---- =cut ! sub parseKnownGenePep { my $filename = shift; my $annotations = {}; ! open ANNFILE, $filename or die "Can't open file $filename: $!"; while(<ANNFILE>) { chomp; ! next if /^#/; my ($accession,$sequence) = split /\t/; my $protGenbankId = $mrna2protein->{$accession}; $annotations->{$protGenbankId} = $sequence; *************** *** 358,368 **** =cut ! sub parseGenbank{ my $file = shift; my $annotations = {}; # stores the mRNA genbank id as key, protein genbank id as value ! open ANNFILE, $file or die "Can't open file $file\n"; while(<ANNFILE>) { chomp; ! next if /^\#/;; my ($mrna, $prot) = split /\t/; $annotations->{$mrna} = $prot; --- 350,360 ---- =cut ! sub parseGenbank { my $file = shift; my $annotations = {}; # stores the mRNA genbank id as key, protein genbank id as value ! open ANNFILE, $file or die "Can't open file $file: $!"; while(<ANNFILE>) { chomp; ! next if /^#/; my ($mrna, $prot) = split /\t/; $annotations->{$mrna} = $prot; *************** *** 375,378 **** --- 367,371 ---- =head2 parseKgXref + Title : parseKgXref Usage : *************** *** 381,392 **** Returns : Args : =cut sub parseKgXref { my $filename = shift; my $annotations = {}; ! open ANNFILE, $filename or die "Can't open file $filename\n"; while(<ANNFILE>) { chomp; ! next if /^\#/;; # first two are the same (genebank) followed by swissprot etc... my ($kgID, $mRNA, $spID, $spDisplayID, $geneSymbol, $refseq, $protAcc, $description) = split /\t/; --- 374,388 ---- Returns : Args : + + =cut + sub parseKgXref { my $filename = shift; my $annotations = {}; ! open ANNFILE, $filename or die "Can't open file $filename: $!"; while(<ANNFILE>) { chomp; ! next if /^#/; # first two are the same (genebank) followed by swissprot etc... my ($kgID, $mRNA, $spID, $spDisplayID, $geneSymbol, $refseq, $protAcc, $description) = split /\t/; *************** *** 398,411 **** $description = uri_escape($description); ! $annotations->{$key}->{kgID}->{$kgID} = 1 if $kgID; ! $annotations->{$key}->{mRNA}->{$mRNA} = 1 if $mRNA; ! $annotations->{$key}->{spID}->{$spID} = 1 if $spID; ! $annotations->{$key}->{spDisplayID}->{$spDisplayID} = 1 if $spDisplayID; ! $annotations->{$key}->{geneSymbol}->{$geneSymbol} = 1 if $geneSymbol; ! $annotations->{$key}->{refseq}->{$refseq} = 1 if $refseq; ! $annotations->{$key}->{protAcc}->{$protAcc} = 1 if $protAcc; ! $annotations->{$key}->{description}->{$description} = 1 if $description; ! my $protAccession = $mrna2protein->{$mRNA}; # pulls out the protein genbank accession ! $annotations->{$key}->{protein}->{$protAccession} = 1 if $protAccession; } close ANNFILE; --- 394,408 ---- $description = uri_escape($description); ! my $protAccession = $mrna2protein->{$mRNA}; # pulls out the protein genbank accession ! ! $annotations->{$key}->{'db:genbank:protein'}->{$protAccession} = 1 if $protAccession; ! $annotations->{$key}->{'db:genbank:mrna'}->{$kgID} = 1 if $kgID; ! $annotations->{$key}->{'db:genbank:mrna'}->{$mRNA} = 1 if $mRNA; ! $annotations->{$key}->{'db:swissprot'}->{$spID} = 1 if $spID; ! $annotations->{$key}->{'db:swissprot:display'}->{$spDisplayID} = 1 if $spDisplayID; ! $annotations->{$key}->{'genesymbol'}->{$geneSymbol} = 1 if $geneSymbol; ! $annotations->{$key}->{'db:refseq:mrna'}->{$refseq} = 1 if $refseq; ! $annotations->{$key}->{'db:refseq:protein'}->{$protAcc} = 1 if $protAcc; ! $annotations->{$key}->{'description'}->{$description} = 1 if $description; } close ANNFILE; |
From: <allenday@us...> - 2003-12-19 00:33:01
|
Update of /cvsroot/gmod/schema/chado/load/bin In directory sc8-pr-cvs1:/tmp/cvs-serv19396/load/bin Modified Files: load_gff3.PLS Log Message: removing config cruft Index: load_gff3.PLS =================================================================== RCS file: /cvsroot/gmod/schema/chado/load/bin/load_gff3.PLS,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** load_gff3.PLS 18 Dec 2003 17:07:31 -0000 1.3 --- load_gff3.PLS 19 Dec 2003 00:32:57 -0000 1.4 *************** *** 110,113 **** --- 110,115 ---- committing to the database (default: 1000) + --force Force the file to load, even if it has already been + loaded before =head1 AUTHORS *************** *** 122,126 **** =cut ! my ($ORGANISM, $SRC_DB, $GFFFILE, $CACHE_SIZE); GetOptions('organism:s' => \$ORGANISM, --- 124,128 ---- =cut ! my ($ORGANISM, $SRC_DB, $GFFFILE, $CACHE_SIZE, $FORCE_LOAD); GetOptions('organism:s' => \$ORGANISM, *************** *** 128,131 **** --- 130,134 ---- 'gfffile:s' => \$GFFFILE, 'cache:s' => \$CACHE_SIZE, + 'force' => \$FORCE_LOAD, ) or (system('pod2text',$0), exit -1); *************** *** 235,240 **** my $mtime = (stat($GFFFILE))[9]; my ($pub) = Chado::Pub->search(title => $GFFFILE." ".$mtime); ! if ($pub) { ! print "It appears that you have already loaded this exact file\n"; print "Do you want to continue [no]? "; chomp (my $response = <STDIN>); --- 238,243 ---- my $mtime = (stat($GFFFILE))[9]; my ($pub) = Chado::Pub->search(title => $GFFFILE." ".$mtime); ! if ($pub and !$FORCE_LOAD) { ! print "\nIt appears that you have already loaded this exact file\n"; print "Do you want to continue [no]? "; chomp (my $response = <STDIN>); |