Update of /cvsroot/gmod/schema/GMODTools/lib/Bio/GMOD/Bulkfiles In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv13108/lib/Bio/GMOD/Bulkfiles Modified Files: FeatureWriter.pm MyLargePrimarySeq.pm ToFasta.pm ToFormat.pm Added Files: BlastWriter.pm BulkWriter.pm FastaWriter.pm GnomapWriter.pm Log Message: more details worked out; good for dmel, tested w/ sgd chado dbs --- NEW FILE: BlastWriter.pm --- package Bio::GMOD::Bulkfiles::BlastWriter; use strict; =head1 NAME Bio::GMOD::Bulkfiles::BlastWriter =head1 SYNOPSIS use Bio::GMOD::Bulkfiles; my $sequtil= Bio::GMOD::Bulkfiles->new( # was SeqUtil2 configfile => 'seqdump-r4', ); my $fwriter= $sequtil->getBlastWriter(); my $result= $fwriter->makeFiles( infiles => [ @$fastafiles ], # required ); =head1 NOTES genomic sequence file utilities, part3; parts from flybase/work.local/chado_r3_2_26/soft/blastdbupdate.pl =head1 AUTHOR D.G. Gilbert, 2004, gil...@in... =head1 METHODS =cut #----------------- # debug use lib("/bio/biodb/common/perl/lib", "/bio/biodb/common/system-local/perl/lib"); use POSIX; use FileHandle; use File::Spec::Functions qw/ catdir catfile /; use File::Basename; use Bio::GMOD::Bulkfiles::BulkWriter; use vars qw(@ISA); @ISA= (qw/Bio::GMOD::Bulkfiles::BulkWriter/); ## want interface class our $DEBUG = 0; my $VERSION = "1.0"; my $configfile= "blastfiles"; #? BulkFiles/BlastWriter.xml use vars qw/ $formatdb /; sub init { my $self= shift; $self->SUPER::init(); $DEBUG= $self->{debug} if defined $self->{debug}; $self->{configfile}= $configfile unless defined $self->{configfile}; # $self->{failonerror}= 0 unless defined $self->{failonerror}; ## $self->setDefaultValues(); #?? use or not? } =item initData initialize data from config =cut sub initData { my($self)= @_; $self->SUPER::initData(); my $config = $self->{config}; my $blastdir= $self->handler()->getReleaseSubdir( $self->{finfo}->{path} || 'blast/'); $self->{blastdir} = $blastdir; $config->{isprot_patt} ||= '(translation|aa_)'; my $blasthome= $config->{blasthome} ; #|| "$oroot/common/servers/blast/Bin"; $formatdb= $config->{formatdb} || "$blasthome/formatdb"; unless(-e $formatdb) { warn "Missing formatdb: $formatdb"; } my $formatdbopts= $config->{formatdbopts} || '-o F '; # T - True: Parse SeqId and create indexes. # -t Title $config->{formatdbopts}= $formatdbopts; } #-------------- subs ------------- =item makeFiles( %args ) primary method makes blast indices. input file sets are intermediate chado db dump tables. arguments: infiles => \@fileset, # required =cut sub makeFiles { my $self= shift; my %args= @_; print STDERR "BlastWriter::makeFiles\n" if $DEBUG; # debug my $fileset = $args{infiles}; my $chromosomes = $args{chromosomes}; unless(ref $fileset) { my $intype= $self->{config}->{informat} || 'fasta'; #? maybe array $fileset = $self->handler->getFiles($intype, $chromosomes); # warn "makeFiles: no infiles => \@filesets given"; return; } my @seqfiles= $self->openInput( $fileset ); my $res= $self->processBlastInput( \@seqfiles); print STDERR "BlastWriter::makeFiles: done\n" if $DEBUG; return 1; #what? } =item openInput( $fileset ) handle input files =cut sub openInput { my $self= shift; my( $fileset )= @_; # do per-csome/name my @files= (); my $inh= undef; return undef unless(ref $fileset); my $intype = $self->{config}->{informat} || 'fasta'; #? maybe array my $featset= $self->{config}->{blastset} || []; my @featset= @$featset; print STDERR "openInput: type=$intype blastset=",join(",",@featset),"\n" if $DEBUG; foreach my $fs (@$fileset) { my $fp= $fs->{path}; my $name= $fs->{name}; my $type= $fs->{type}; # want also/instead featset type here ? gene,mrna,cds,... # next unless( $type =~ /$intype/); # could it be 'dna/fasta', 'amino/fasta' ? my ($featn,$format)= split /[\/]/, $type; my $ok= ( $type =~ /$intype/) ; $ok = grep({$_ eq $featn} @featset) if ($ok && @featset); # NO: do below; check for 'all' chr, skip or use instead if there ?? -- need some option #### $ok= 0 if ($fs->{chr} eq 'all'); # || $name =~ m/_all_/ print STDERR "$name $featn, $type, ok=$ok\n" if $DEBUG; next unless $ok; ##( !@featset || grep /^$featn/, @featset); unless(-e $fp) { warn "missing intype file $fp"; next; } push(@files, $fp); #?? return full $fs struct w/ $featn ? } return @files; } =item get_filename( $org, $chr, $featn, $rel, $format) make standard output file name "${org}_${chr}_${featn}_${rel}.${format}" =cut sub get_filename { return shift->blastname(@_); ## {sequtil}->get_filename( @_); } # sub split_filename # { # return shift->handler()->split_filename( @_); # } sub blastname { my $self= shift; my($dbname)= @_; if ($dbname =~ s/(\d[\d\.]*\.\d)/ZYX/) { my $r=$1; $r=~s/\.//g; $dbname =~ s/ZYX/$r/; } # squeeze r3.2.1 to r321 $dbname =~ s/\..*$//; # drop .fasta.gz etc. return $dbname; } =item processBlastInput =cut sub processBlastInput { my $self= shift; my( $rseqfiles )= @_; my $blastdir= $self->{blastdir}; my ($doformat, $doconfig)= (1,1); my $ndone= 0; # format only if changed... $self->updateformat( $blastdir, $rseqfiles) if ($doformat); my @blastfiles=(); opendir(D, $blastdir); @blastfiles= grep(/^\w/,readdir(D)); closedir(D); if ($doconfig) { my %dbinfo= $self->getDbDirInfoByTitle( $blastdir, \@blastfiles); $self->update_blastrc( $blastdir, \@blastfiles); $self->update_dbselect( \%dbinfo); $self->update_dbhtml( \%dbinfo); } $ndone= scalar( @blastfiles); print STDERR "processBlastInput ndone = $ndone\n" if $DEBUG; return $ndone; } #------------- # parts from flybase/work.local/chado_r3_2_26/soft/blastdbupdate.pl #------------- sub update_blastrc { my $self= shift; my($path, $rdir)= @_; my $rcdoc= $self->{config}->{doc}->{dbrc}; my @dir= @$rdir; warn "update_blastrc()\n" if $DEBUG; my $nalist= join " ", map{ $self->blastname($_) } grep(/\.(nhr|nal)$/,@dir); my $aalist= join " ", map{ $self->blastname($_) } grep(/\.(phr|pal)$/,@dir); my @rc= split "\n",$rcdoc->{content}; push(@rc, qw/blastn blastp blastx tblastn tblastx/) unless(@rc); foreach (@rc) { s/^blastn.*$/blastn $nalist/; s/^tblastn.*$/tblastn $nalist/; s/^tblastx.*$/tblastx $nalist/; s/^blastp.*$/blastp $aalist/; s/^blastx.*$/blastx $aalist/; } $rcdoc->{content}= join("\n", @rc); $self->handler()->writeDocs( { dbrc => $rcdoc }); } sub getDbInfo { my $self= shift; my ( $dbhash, $path, $blastfile)= @_; my $blname = $self->blastname($blastfile); my ( $org, $chr, $featn, $rel, $format)= $self->split_filename($blname); my $db= $dbhash->{$featn}; # probably this unless($db) { $db= $dbhash->{$blname}; } unless($db) { (my $bl2= $blname) =~s/_r\d+.*//; $db= $dbhash->{$bl2}; } # w/o version unless($db) { my $isprot_patt = $self->config->{isprot_patt}; my $doprot= ($blname =~ m/$isprot_patt/) ? 1 : 0; my $tt= (($chr eq 'all') ? "All" : $chr) . " $org $featn " . (($doprot) ? "(AA)" : "(NT)"); $db= { name => $blname, title => $tt, content => "", }; } my $ftime= $^T - 24*60*60*(-M $blastfile); $db->{date}= POSIX::strftime("%d-%b-%Y", localtime( $ftime )); $db->{blname}= $blname; $db->{files} = $blname; my ($files); ## FIXME need to write .nal, .pal from config? if (-r "$path/$blname.nal") { open(BL,"$blname.nal"); ($files)=grep(/DBLIST/,<BL>); close(BL);} elsif (-r "$path/$blname.pal") { open(BL,"$blname.pal"); ($files)=grep(/DBLIST/,<BL>); close(BL);} if ($files) { $files =~ s/\s*DBLIST\s*//; $db->{files}= $files; } return $db; } sub getDbDirInfoByTitle { my $self= shift; my($path, $dirlist)= @_; my $dbhash= $self->{config}->{db}; my @blf= grep(/\.(nhr|phr|nal|pal)$/, @$dirlist); my %dbh=(); foreach my $blf (@blf) { my $db= $self->getDbInfo($dbhash, $path, $blf); next if ($db->{skip}); my $title= $db->{title}; $dbh{$title}= $db; } return %dbh; } sub update_dbhtml { my $self= shift; my($dbh)= @_; warn "update_dbhtml\n" if $DEBUG; my $dbtable= $self->{config}->{doc}->{dbtable}; my $content= $dbtable->{header}->{content} || '<html><body><table> '; $content .= $dbtable->{tableheader}->{content} || "<tr bgcolor='#a0a0a0'> <td color='white'>Pull-down Menu</td> <td color='white'>Database file</td> <td color='white'>Update</td> <td color='white'>Description</td> </tr>\n"; foreach my $title (sort keys %$dbh) { my $db = $dbh->{$title}; $content .= "<tr> <th>$title</th> <th>$db->{files}</th> <th>$db->{date}</th> <th>$db->{content}</th> </tr>\n"; } $content .= $dbtable->{footer}->{content} || ' </table></body></html>'; $dbtable->{content}= $content; $self->handler()->writeDocs( { dbtable => $dbtable }); } sub update_dbselect { my $self= shift; my($dbh)= @_; warn "update_dbselect \n" if $DEBUG; my $dbselect= $self->{config}->{doc}->{dbselect}; my $content= $dbselect->{header}->{content} || ''; $content .= "<select name = \"DATALIB\">\n"; foreach my $title (sort keys %$dbh) { my $db = $dbh->{$title}; $content .= " <option VALUE = \"$db->{blname}\"> $title\n"; } $content .= "</select>\n"; $content .= $dbselect->{footer}->{content} || ''; $dbselect->{content}= $content; $self->handler()->writeDocs( { dbselect => $dbselect }); } sub updateformat { my $self= shift; # my($path, $rdir)= @_; my ( $blastdir, $datafiles)= @_; my $isprot_patt = $self->config->{isprot_patt}; my @fastafiles= @$datafiles; ##grep(/\.(gz|Z|fa|fasta)$/, @$datafiles); #? or assume all @files are fasta? my %dbset= (); warn "update formatdb \n" if $DEBUG; ## FIXME: input fasta == $org_$chr_$feature_$release ## >> cat all $org_{1..n}_feature into one blastdb my %alldata=(); foreach my $fa (@fastafiles) { my ( $org, $chr, $featn, $rel, $format)= $self->split_filename($fa); ## keep release in blast db name ## check here if have 'all' $chr input, if so skip any other $chr in same set my $blname= $self->handler->get_filename($org,'all',$featn,$rel,$format); $blname= $self->blastname($blname); $alldata{$blname}= $fa if ($chr eq 'all'); ##my ($sfile, $spath, $ext) = File::Basename::fileparse($fa, '\.[^\.]+'); ##my $blname = $self->blastname($sfile); my $blf = $blname . '.nhr'; unless (-e "$blastdir/$blf") { $blf= $blname . '.phr'; } # $_='' unless ($self->isold( $_, "$blastdir/$blf")); # skip current indices if ($self->isold( $fa, "$blastdir/$blf")) { unless($dbset{$blname}) { $dbset{$blname}= []; } push( @{$dbset{$blname}}, $fa); } } foreach my $blname (keys %dbset) { my $doprot= ($blname =~ m/$isprot_patt/) ? 1 : 0; my $seqlist= $dbset{$blname}; if ($alldata{$blname}) { $seqlist= [ $alldata{$blname} ]; } $self->formatdb_list( $seqlist, $blastdir, $blname, $doprot); # may chdir } # foreach my $db (@dbs) { # next unless($db && -e $db); # my $doprot= ($db =~ m/$isprot_patt/) ? 1 : 0; # $self->formatdb($db, $blastdir, $doprot); # may chdir # } } sub formatdb_list { my $self= shift; my( $seqlist, $blastdir, $blastname, $doprot)= @_; warn "formatdb( $blastname )\n" if $DEBUG; my $opts= $self->{config}->{formatdbopts}; ##$formatdbopts; if ($doprot) { $opts .= ' -p T ';} else { $opts .= ' -p F '; } warn("#$blastname: cat ",join(" ",@$seqlist)," | $formatdb $opts -i stdin \n") if $DEBUG; my $olddir= $ENV{'PWD'}; #?? not safe? chdir($blastdir); # need for formatdb - no good -out option foreach (@$seqlist) { $_ = catfile($olddir,$_) unless($_ =~ m,^/,); } my $seqlib = join(" ",@$seqlist); my $cat= ($seqlib =~ /\.(gz|Z)/) ? 'gunzip -c' : 'cat'; system("$cat $seqlib | $formatdb $opts -i stdin "); opendir(D,"."); my @f= grep(/stdin/,readdir(D)); closedir(D); foreach my $f (@f) { (my $t= $f) =~ s/stdin/$blastname/; rename($f,$t); } chdir($olddir); } sub formatdb { my $self= shift; my($seqlib, $blastdir, $doprot)= @_; warn "formatdb($seqlib)\n" if $DEBUG; my $opts= $self->{config}->{formatdbopts}; ##$formatdbopts; if ($doprot) { $opts .= ' -p T ';} else { $opts .= ' -p F '; } my ($sfile, $spath, $ext) = File::Basename::fileparse($seqlib, '\.[^\.]+'); my $blastname = $self->blastname($sfile); my $olddir= $ENV{'PWD'}; #??? chdir($blastdir); # need for formatdb - no good -out option my $cat= ($seqlib =~ /\.(gz|Z)$/) ? 'gunzip -c' : 'cat'; $seqlib = catfile($olddir,$seqlib) unless($seqlib =~ m,^/,); warn("$cat $seqlib | $formatdb $opts -i stdin \n") if $DEBUG; system("$cat $seqlib | $formatdb $opts -i stdin "); opendir(D,"."); my @f= grep(/stdin/,readdir(D)); closedir(D); foreach my $f (@f) { (my $t= $f) =~ s/stdin/$blastname/; rename($f,$t); } chdir($olddir); } 1; __END__ Index: MyLargePrimarySeq.pm =================================================================== RCS file: /cvsroot/gmod/schema/GMODTools/lib/Bio/GMOD/Bulkfiles/MyLargePrimarySeq.pm,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** MyLargePrimarySeq.pm 23 Aug 2004 06:44:41 -0000 1.1 --- MyLargePrimarySeq.pm 30 Aug 2004 04:51:28 -0000 1.2 *************** *** 16,22 **** use vars qw(@ISA); use Bio::Seq::LargePrimarySeq; ! BEGIN{ ! @ISA = qw(Bio::Seq::LargePrimarySeq); ! } sub new { --- 16,20 ---- use vars qw(@ISA); use Bio::Seq::LargePrimarySeq; ! BEGIN{ @ISA = qw(Bio::Seq::LargePrimarySeq); } sub new { --- NEW FILE: GnomapWriter.pm --- package Bio::GMOD::Bulkfiles::GnomapWriter; use strict; =head1 NAME Bio::GMOD::Bulkfiles::GnomapWriter =head1 SYNOPSIS use Bio::GMOD::Bulkfiles; my $sequtil= Bio::GMOD::Bulkfiles->new( configfile => 'seqdump-r4', ); my $fwriter= $sequtil->getGnomapWriter(); my $result = $fwriter->makeFiles( ); =head1 NOTES This is FlyBase-specific for now ; handles fly cytology map features and gnomap feature indexing genomic sequence file utilities, part3; parts from flybase/work.local/chado_r3_2_26/soft/mergeflyfeats4.pl =head1 AUTHOR D.G. Gilbert, 2004, gil...@in... =head1 METHODS =cut #----------------- # debug use lib("/bio/biodb/common/perl/lib", "/bio/biodb/common/system-local/perl/lib"); use POSIX; use FileHandle; use File::Spec::Functions qw/ catdir catfile /; use File::Basename; use Bio::GMOD::Bulkfiles::BulkWriter; use vars qw(@ISA); @ISA= (qw/Bio::GMOD::Bulkfiles::BulkWriter/); ## want interface class use constant RECSIZE => length(pack("NN", 1, 50000)); our $DEBUG = 0; my $VERSION = "1.0"; my $configfile= "tognomap"; #? BulkFiles/GnomapWriter.xml my $kMissingValue= -99999999; my $kMaxValue= 999999999; my $kMinValue= $kMissingValue+1; use vars qw/ %flycsomebands $noIDmap $indexidtype $indexidpattern /; sub init { my $self= shift; $self->SUPER::init(); $DEBUG= $self->{debug} if defined $self->{debug}; $self->{configfile}= $configfile unless defined $self->{configfile}; } =item initData initialize data from config =cut sub initData { my($self)= @_; $self->SUPER::initData(); my $config = $self->{config}; ## gnomapfiles my $finfo= $self->{fileinfo} || $self->handler()->getFilesetInfo('gnomap'); my $dnainfo= $self->handler()->getFilesetInfo('dna'); $self->{indexonly} = $finfo->{indexonly}; $noIDmap = $finfo->{noidmap} || $config->{noidmap} || 'cytowalk|protein|mRNA|CDS|EST|cDNA|oligo|processed|repeat|sim4'; $indexidtype= $finfo->{indexidtype} || $config->{indexidtype} || '^(gene|pseudogene|\w+RNA)'; $indexidpattern= $finfo->{indexidpattern} || $config->{indexidpattern} || '[A-Z]{2}gn\d+'; %flycsomebands = ( 'X' => [ '1A1','20F4'], '2L' => ['21A1','40F7'], '2R' => ['41A1','60F5'], '3L' => ['61A1','80F9'], '3R' => ['81F1','100F5'], '4' => ['101A1','102F8'], ); my $gnomapdir= $self->handler()->getReleaseSubdir( $finfo->{path} || 'gnomap/'); $self->{gnomapdir} = $config->{gnomapdir}= $gnomapdir; my $reldir= $self->handler()->getReleaseDir(); my $dnareldir= $self->handler()->getReleaseSubdir( $dnainfo->{path} || 'dna/'); $dnareldir =~ s,^$reldir,,; $dnareldir = "../$dnareldir"; $self->{dnareldir}= $dnareldir; my $sumfile= catfile( $reldir, "feature_map-summary.txt"); # or $finfo->{summaryfile} || $config->{summaryfile} $self->{summaryfile} = $sumfile; ## dont create subdir - use only if exists .. my $cytomapdir= $self->handler()->getReleaseSubdir( $config->{cytofeat}->{path} || 'cytomap/', 'nocreate'); ## use <fileset name=cytomap ?? $self->{cytomapdir} = $cytomapdir; my $sorsa= $config->{sorsa}->{path}; if ($sorsa) { $sorsa= catfile( $self->handler()->getReleaseDir(), $sorsa); $self->{sorsatable} = $sorsa; } } #-------------- subs ------------- =item notes for simple, non-dmel-flybase case of no cytofeats to add, need only here create gnomap/ folder files w/ indexed fff (stripped of leading chr, bstart columns flybase cytofeatures look like (fff format) 2L 1 cytogene anon-21Aa 21A -30856..353 FBgn0015861 2L 1 cytogene l(2)21Bb 21A1-21B4 -30856..213625 FBgn0001885 X 22039275 cytoduplicated_segment Ts(1Lt;YSt)E15+Ts(YLt;3Lt)W27 20F-h29;91A-100F5 22039275..22098704 FBab0025172 X 5216754 cytoduplicated_segment Dp(1;2;1)AT 5A-7A;36D1-37D2 5216754..6912751 FBab0003372 =cut =item makeFiles( %args ) primary method makes blast indices. input file sets are intermediate chado db dump tables. arguments: infiles => \@fileset, # required =cut sub makeFiles { my $self= shift; my %args= @_; print STDERR "makeFiles\n" if $DEBUG; # debug my $fileset = $args{infiles}; my $chromosomes = $args{chromosomes}; unless(ref $fileset) { my $intype= $self->{config}->{informat} || ['fff','dna']; #? maybe array $fileset = $self->handler->getFiles($intype, $chromosomes); # warn "makeFiles: no infiles => \@filesets given"; return; } ## infiles => [ @$featfiles, @$dnafiles ] ## $self->readflysorsa(); #? here or wait if ($self->{indexonly}) { warn "Indexing only features in $self->{gnomapdir}\n" if $DEBUG; $self->indexfeatdir($self->{gnomapdir}); return 1; } ## ---------- get cytology set ---------- # == cytomap/ folder # need to get seq id list and purge cytogene/.. w/ same id ## my ($nr, $nk)= cytosort( $fset_h, $dir.$hf, $csome, *FO); my $addcyto= 0; if (-d $self->{cytomapdir}) { # $config->{cytofeat}->{path} OR fileset->cytomap my $cytoset= $self->handler->getFiles('cytomap', $chromosomes); if ($cytoset) { $fileset= [ @$fileset, @$cytoset ]; $addcyto=1; } #? add to $fileset # require cytomapdir to have tables in fffformat==2, split by chr # then merge steps are # 1. readids (seqset) (see fastawriter) # 2. filter cytomap fff by seq ids # 3. cat seq,cyto fff | sort | write to gnomapdir # $addcyto= 1; } ## ---------- get sequence set ---------- # == fff/ folder # 1. symlink dnafiles to gnomap/dna-$chr.raw # 2. copy fff/release-$chr.fff to gnomap/features-$chr.tsv , stripping lead 2 cols $self->addDnaSymlinks($fileset); if($addcyto) { $self->mergeFeats($fileset); } else { $self->copyFFF2Gnomap($fileset); } $self->printSummary( $self->{summaryfile}, $self->{featnames}); $self->indexfeatdir($self->{gnomapdir}) ; #if ($doindex); my @featnames= (); if ($self->{featnames}->{all}) { @featnames= sort keys %{$self->{featnames}->{all}}; } $self->makeGbrowseConf(\@featnames); # should pass array of feature names ! print STDERR "GnomapWriter::makeFiles: done\n" if $DEBUG; return 1; #what? } sub mergeFeats { my $self= shift; my( $fileset )= @_; # do per-csome/name my $ok= 1; my $filterids= 1; #?? config my $gnomapdir= $self->{gnomapdir}; my $sorter=`which sort`; chomp($sorter); ## '/bin/sort'; '/usr/bin/sort'; for (my $ipart= 0; $ok; $ipart++) { $ok= 0; my $infile= $self->openInput( $fileset, $ipart, 'fff'); if ($infile && $infile->{inh}) { my $chr= $infile->{chr}; if ($filterids) { my $inh= $infile->{inh}; my $idlist= $self->readIdsFromFFF( $inh, $chr, $self->handler()->{config}); # for featmap ? $self->{idlist}= $idlist; ##$inh= $self->resetInput($infile); } # require cytomapdir to have tables in fffformat==2, split by chr # then merge steps are # 1. readids (seqset) (see fastawriter) # 2. filter cytomap fff by seq ids # 3. cat seq,cyto fff | sort | write to gnomapdir ## this not good - need to make sure same $chr as $infile my $inmerge; my $mergepipe; foreach my $fs (@$fileset) { my $fp = $fs->{path}; next unless($fs->{type} =~ /cytomap/); next unless($fs->{chr} eq $chr); $inmerge = $fs; last; } if ($inmerge) { close($infile->{inh}) if ($infile->{inh}); # if .gz ?? my $catset= $infile->{path} ." ".$inmerge->{path}; if ($catset =~ /\.gz/) { $catset= "gunzip -c ".$catset; } else { $catset= "cat ".$catset; } $catset .= " | $sorter -t' ' -k 1,1 -k 2,2n |"; #NOTE TAB in ' ' open(MERGE,$catset); $mergepipe= *MERGE; } else { # just cat infile $mergepipe= $self->resetInput($infile); } my $outname= catfile($gnomapdir,"features-$chr.tsv"); my $outh= new FileHandle(">$outname"); print STDERR "merge $outname from $infile->{path}, $inmerge->{path}\n" if $DEBUG; my $res= $self->merge2gnomap( $mergepipe, $outh, $chr); close($outh); close($mergepipe); $ok= 1; } } ## $self->printSummary( $self->{summaryfile}, $self->{featnames}); } sub merge2gnomap { my $self= shift; my( $inh, $outh, $chr )= @_; # do per-csome/name # my $csomefeats= $self->{featnames}; my $ffformat = 0; #? test always; probably is 2 while(<$inh>){ if (/^\w/ && /\t/) { my @v= split "\t", $_; if ( $ffformat == 2 || @v > 7 || ($v[0] =~ /^\w/ && $v[1] =~ /^[\d-]+$/)) { $ffformat= 2; splice(@v,0,2); } print $outh join("\t",@v); my $fname= $v[0]; $self->{featnames}->{all}->{$fname}++; # save for gbrowse... $self->{featnames}->{$chr}->{$fname}++; # save for gbrowse... } else { print OUT $_; } } ##$self->printSummary( $self->{summaryfile}, $self->{featnames}); } sub addDnaSymlinks { my $self= shift; my( $fileset )= @_; # do per-csome/name my $intype = 'dna/raw'; my $gnomapdir= $self->{gnomapdir}; my $dnareldir= $self->{dnareldir}; foreach my $fs (@$fileset) { my $fp= $fs->{path}; my $name= $fs->{name}; my $type= $fs->{type}; my $chr= $fs->{chr}; next unless( $fs->{type} eq $intype); ## unless(-e $fp) { warn "missing intype file $fp"; next; } my($filename, $dir) = File::Basename::fileparse($fp); my $relpath= catfile($dnareldir, $filename); #? my $symname= catfile($gnomapdir,"dna-$chr.raw"); symlink( $relpath, $symname); print STDERR "symlink dna-$chr.raw -> $relpath\n" if $DEBUG; } } sub copyFFF2Gnomap { my $self= shift; my( $fileset )= @_; # do per-csome/name my $intype = 'fff'; my $gnomapdir= $self->{gnomapdir}; # my %csomefeats=(); $self->{featnames}= {}; foreach my $fs (@$fileset) { my $fp= $fs->{path}; my $name= $fs->{name}; my $type= $fs->{type}; my $chr= $fs->{chr}; next unless( $fs->{type} =~ /$intype/); unless(-e $fp) { warn "missing $intype file $fp"; next; } my $featname= catfile($gnomapdir,"features-$chr.tsv"); print STDERR "copy $featname from $fp\n" if $DEBUG; #?? include here opt to merge cyto feats - w/ sort if ($fp =~ m/\.(gz|Z)$/) { open(FF,"gunzip -c $fp|"); } else { open(FF,"$fp"); } my $ffformat = 0; #? test always; probably is 2 open(OUT,">$featname"); while(<FF>){ if (/^\w/ && /\t/) { my @v= split "\t", $_; if ( $ffformat == 2 || @v > 7 || ($v[0] =~ /^\w/ && $v[1] =~ /^[\d-]+$/)) { $ffformat= 2; splice(@v,0,2); } print OUT join("\t",@v); my $fname= $v[0]; $self->{featnames}->{all}->{$fname}++; # save for summary && gbrowse... $self->{featnames}->{$chr}->{$fname}++; } else { print OUT $_; } } close(FF); close(OUT); } ## $self->printSummary( $self->{summaryfile}, $self->{featnames}); } sub printSummary { my $self= shift; my( $sumfile, $csomefeats )= @_; if ( $sumfile && $csomefeats ) { my $fh= new FileHandle(">$sumfile"); my $title = $self->{config}->{title}; my $date = $self->{config}->{date}; my $org = $self->{config}->{species} || $self->{config}->{org}; print $fh "# Genome feature summary of $org from $title [$date]\n"; my @fl= grep { 'all' ne $_ } sort keys %$csomefeats; foreach my $arm ('all', @fl) { print $fh (($arm eq 'all') ? "\n# ALL chromosomes\n" : "\n# Chromosome $arm\n"); foreach my $t (sort keys %{$csomefeats->{$arm}}) { my $v= $csomefeats->{$arm}{$t}; print $fh "$t\t$v\n"; } print $fh "#","="x50,"\n"; } close($fh); } } sub makeGbrowseConf { my $self= shift; my($featnames)= @_; warn "makeGbrowseConf\n" if $DEBUG; ## need active feature set from ? feature-summary.txt or fff/ files my $config={}; # stuff with $self->handler->config && others $config= $self->handler->{config}; ##my $gbrowseconf= $config->{gbrowsefiles}->{config}; ##my $gbrowseconf= $self->{gbrowseconf}; my $gbset= $self->handler->getFilesetInfo('gbrowse'); my $gbrowseconf= $gbset->{config}; # add vars to config # description = ${species} ${relfull} ${date} OR ${title} ?? # datapath = ${gnomapdir} $config->{gnomapdir}= $self->{gnomapdir}; my ($loc, $ex)= ('',''); my $chromosomes= $self->handler->getChromosomes(); foreach my $chr (@$chromosomes) { $loc= "$chr:1..100000" unless($loc); $ex .= "$chr: "; } $config->{ default_location } = $loc; $config->{ examples } = $ex; my $config2= $self->handler->{config2}; my $gbconf= $config2->readConfig( $gbrowseconf, { Variables => $config, debug => $DEBUG, }, {} ); print STDERR $config2->showConfig( $gbconf, { debug => $DEBUG }) if ($self->{showconfig}); my $doc = $gbconf->{doc}->{gbrowse}; my $fdefs= $gbconf->{fdef}; my $content= $doc->{header}->{content} || ''; my @featnames=(); @featnames= @$featnames if (ref $featnames); @featnames= sort keys %$fdefs unless(@featnames); foreach my $fname (@featnames) { my $fd = $fdefs->{$fname}; unless( $fd ) { # next; # check all hash {feature} strings for match... $fd = $fdefs->{GENERIC}; my $gct= $fd->{content}; $gct =~ s/GENERIC/$fname/sg; $fd= { name => $fname, content => $gct }; } next if ($fd->{done}); my $morefeats= $fd->{feature}; my $ct= $fd->{content}; $content .= $ct."\n"; $fd->{done}=1; } $content .= $doc->{footer}->{content} || ''; $doc->{content}= $content; $doc->{path}= $gbset->{path}; #$config->{gbrowsefiles}->{path}; #?? $self->handler()->writeDocs( { gbrowseconf => $doc }); } # =item openInput( $fileset ) # # handle input files # # =cut # # sub openInput # { # my $self= shift; # my( $fileset )= @_; # do per-csome/name # my @files= (); # my $inh= undef; # return undef unless(ref $fileset); # # my $intype = $self->{config}->{informat} || 'fff'; #? maybe array # my $featset= $self->{config}->{featset} || []; # # print STDERR "openInput: type=$intype \n" if $DEBUG; # # foreach my $fs (@$fileset) { # my $fp= $fs->{path}; # my $name= $fs->{name}; # my $type= $fs->{type}; # want also/instead featset type here ? gene,mrna,cds,... # next unless( $fs->{type} =~ /$intype/); # could it be 'dna/fasta', 'amino/fasta' ? # unless(-e $fp) { warn "missing intype file $fp"; next; } # # push(@files, $fp); # } # # return @files; # } =item processToOutput =cut sub processToOutput { my $self= shift; my( $rseqfiles )= @_; } #============== # mostly from flybase/work.local/chado_r3_2_26/soft/mergeflyfeats4.pl #============== ## --- read table of genome:cytology mapping ----- # my $sorsaf="${sorsapath}sorsa.txt"; # if (defined $mconf->{sorsa}->{path}) { # $sorsaf= join("",getFiles( $mconf->{sorsa}->{path} )); # } sub getFiles { my $self= shift; my($path)= @_; my $file; if ($path =~ s,([^/]+)$,,) { $file= $1; } else { $file= $path; $path="./"; } opendir(D,$path); my @file= grep(/$file/,readdir(D)); closedir(D); return $path,@file; } sub readflysorsa { my $self= shift; my ($sorsa)= @_; my %cytobases= %{$self->{cytobases}} || (); return if (scalar(%cytobases)); local(*F,*O); my ($n); my @sorsalist= (); $sorsa= $self->{sorsatable} unless(-e $sorsa); unless (open(F,$sorsa)) { warn "Can't read $sorsa" ; $cytobases{1}= [ 0, 0 ]; # don't come here back again $self->{cytobases}= \%cytobases; return; } warn "reading $sorsa\n" if $DEBUG; while (<F>) { next unless(/^\d/); chomp(); my($cyto,$bb,$be)= split(); ### /\t/ ## (' ') was, now \t separated! $cytobases{$cyto}= [ $bb, $be ]; push( @sorsalist, $cyto); # sorted ! } close(F); $self->{cytobases}= \%cytobases; $self->{sorsalist}= \@sorsalist; } =item replace static sorsa.txt with chromosome_band features from chado db -- gff table 2L . chromosome_band -30855 353 . + . ID=1273798;Name=ba nd-21A1;cyto_range=21A1 2L . chromosome_band -30855 108823 . + . ID=1273801;Name=ba nd-21A;cyto_range=21A 2L . chromosome_band -30855 1318131 . + . ID=1242194;Name=ba nd-21;cyto_range=21 2L . chromosome_band 354 32349 . + . ID=1273804;Name=ba -- use .fff file instead ? 2L -30855 chromosome_band band-21A1 21A1 -30855..353 - 2L -30855 chromosome_band band-21A 21A -30855..108823 - 2L -30855 chromosome_band band-21 21 -30855..1318131 - =cut sub readChadoCytomap { my $self= shift; my( $fset_h, $file, $outh)= @_; local(*F); return "Can't read $file" unless (open(F,$file)); warn "Reading chromosome_band $file \n" if $DEBUG; unless($file =~ m/\.gff$/) { warn "Wrong format - want .gff"; return; } my %cytobases= %{$self->{cytobases}} || (); my @sorsalist= (); ##my @keepset= ($fset_h->{keep}) ? @ {$fset_h->{keep}} : (); ##my @dropset= ($fset_h->{drop}) ? @ {$fset_h->{drop}} : (); while (<F>) { next unless(/^(\w+)\s+(\S+)\s+chromosome_band/); my @gff= split; #next unless (grep {$gff[2] eq $_} @keepset); if ($gff[-1] =~ m/cyto_range=(\S+)/) { my ($cyto,$bb,$be)= ($1, $gff[3], $gff[4]); next unless($cyto =~ m/^\d+[A-F]\d/); # need to drop 1,1A for 1A1 $bb--; # dang interbase -1 doesn't apply to chromosome_band - why not? $cytobases{$cyto}= [ $bb, $be ]; push( @sorsalist, $cyto); # sorted print $outh "$cyto\t$bb\t$be\n" if ($outh); # ! not sorted right here - need chr order } } close(F); $self->{cytobases}= \%cytobases; $self->{sorsalist}= \@sorsalist; } sub getCytobases { my $self= shift; my ($cmap)= @_; my %cytobases= %{$self->{cytobases}} || (); my ($start1,$stop1)= ($kMissingValue,$kMissingValue); ($start1,$stop1)= @ {$cytobases{$cmap}} if $cytobases{$cmap}; return ($start1,$stop1); } sub getCytolocFromSeqloc { my $self= shift; my ($arm, $bstart, $bend)= @_; # $chr my ($cstart, $cend); my @sorsalist= @ { $self->{sorsalist} }; my $ca= $flycsomebands{$arm}->[0]; my $ina= 0; foreach my $cb (@sorsalist) { if ($cb eq $ca) { $ina= 1; } if ($ina) { my ($bb, $be)= $self->getCytobases($cb); # @ {$cytobases{$cb}}; if ($bstart >= $bb && $bstart <= $be) { $cstart= $cb; } if ($bend >= $bb && $bend <= $be) { $cend= $cb; last; } } } if ($cstart && !$cend) { $cend= $cstart; } elsif ($cend && !$cstart) { $cstart= $cend; } if (wantarray) { return ($cstart, $cend); } else { return ($cstart eq $cend) ? $cstart : "$cstart--$cend"; } } sub maxrange { my $self= shift; my( $range)= @_; my ($pre, $suf,$start,$stop, $b, $u); $start= $kMissingValue; $stop= $start; $range =~ s/^([^\d<>-]*)//; $pre= $1; $range =~ s/(\D*)$//; $suf= $1; if ($range =~ m/^([<>]*)([\d-]+)/) { $u= $1; $start= $2; $start-- if ($u eq '<'); } if ($range =~ m/([<>]*)([\d-]+)$/) { $u= $1; $stop= $2; $stop++ if ($u eq '>'); } return ($start,$stop); } sub getCytorange { my $self= shift; my($ca,$cx)= @_; $ca =~ s/^\s*//; $ca =~ s/[\s+-]*$//; return () unless ($ca =~ /^\d/); #? don't have conversion info for hXXX my $offs= 0; ## need patch for 1Lt; 1Rt; 1Cen? h, ... if ($ca !~ /[A-G]/) { if ($cx) { $cx =~ s/\d*$//; $ca= $cx . $ca; } else { $ca .= 'A1'; $offs= -1; } } elsif ($ca !~ /\d$/) { $ca .= '1'; $offs= -1; } my ($start1,$stop1)= $self->getCytobases($ca); #@ {$cytobases{$ca}}; return ($start1,$stop1,$ca); # +$offs } # sub getMap2Bases { return flyCytomap2Bases(@_); } ## ignore ranges outside of csome arm ## same as sub flyCytomap2Bases() sub getMap2Bases { my $self= shift; my ($map, $arm)= @_; my($stop,$start)= ($kMissingValue,$kMissingValue); $self->readflysorsa(); #? here or wait my $carm= $flycsomebands{$arm}->[0]; # $cytoarms{$arm}; my $darm= ($carm =~ m/^(\d+)/) ? $1 : 0; my ($armb, $x)= $self->getCytobases($carm); #@ {$cytobases{$carm}}; my $carme= $flycsomebands{$arm}->[1]; my $darme= ($carme =~ m/^(\d+)/) ? $1 : 0; my ($y, $arme)= $self->getCytobases($carme); #@ {$cytobases{$carme}}; $map =~ s/\s+//g; foreach my $mp (split(/;/, $map)) { next if ($mp eq '*'); next if ($mp =~ /^h/); # cant handle these yet my($ca, $cb)= split(/-/, $mp); my $da= ($ca =~ m/^(\d+)/) ? $1 : 0; next if ($da < $darm || $da > $darme); my($start1,$stop1,$bstart,$bstop); ($start1,$stop1,$ca)= $self->getCytorange($ca); next unless(defined $start1); # next unless($stop1 >= $armb && $start1 <= $arme); if ($cb) { ($bstart,$bstop,$cb)= $self->$self->getCytorange($cb,$ca); $stop1= $bstop if ($bstop); } ##? skip/ignore if both $ca,$cb are outside of $arm ? next if ($stop1 < $armb || $start1 > $arme); $start= $start1 if ($start==$kMissingValue && $start1 >= $armb && $start1 <= $arme); $stop = $stop1 if ($stop1 >= $armb && $stop1 <= $arme); } # $start= $armb if ($start==$kMissingValue && $stop != $kMissingValue); $stop = $start if ($stop==$kMissingValue); ## was = $arme ##?? need band range here, for e.g. '41' => 41A1-41F29 # DEBUG - getting missing when should get real range !?? return (wantarray) ? ($start, $stop) : "$start..$stop"; } ## indexing parts from genomefeat.pl - fixed for changed sorsa.txt, other flyfeat parts ## ## index features*.tsv by ID field for lookup by id ## sub indexfeatdir { my $self= shift; my $dir= shift; local(*D); opendir(D, $dir) || warn "can't open $dir"; my @files= grep( /^features-\w+\.tsv$/, readdir(D)); closedir(D); local(*IMAP,*IMAPX); open(IMAP,">$dir/idmap.tsv"); open(IMAPX,">$dir/idmap.tsv.idx"); foreach my $file (sort @files) { my $sfile= catfile($dir, $file); my $csome= ($file =~ /^features-(\w+)/) ? $1 : 'UNK'; ## warn "indexing chr-$csome, $sfile\n" if $DEBUG; print $self->indexFeatures( $sfile, 'index', $csome); print $self->indexIds( $sfile, 'idindex', $csome, *IMAP, *IMAPX); } close(IMAP); close(IMAPX); } sub indexIds { my $self= shift; my($file, $kind, $csome, $idmapf, $idmapx)= @_; local(*FIN,*FIDX); my ( $n, $nl)= (0,0); $kind= 'idindex' unless($kind); my $idx= $file . ".idx"; die "Can't read $file" unless (open(FIN,$file)); die "Can't write $idx" unless (open(FIDX,">$idx")); # my $recsize = length(pack("NN", 1, 500)); ## a constant -- 2 long integers my %didid= (); my ($at,$ate)= (0,0); while (<FIN>) { $at = $ate; $ate= tell(FIN); if (/^\w/) { $nl++; chomp(); my ($class,$sym,$map,$range,$idv,$dbx)= split(/\t/); ## dang, for cytogene need to make $range from $map next if ($class =~ /$noIDmap/); ## ?? drop or keep next unless($class =~ /$indexidtype/); ## aug04 -- ID field has changed to CG, others - use dbx to get FBgn ID ## look at both fields for 1st valid numeric (FBgn) ID my @ids= (split(/[,;\s]/,$idv),split(/[,;\s]/,$dbx)); # dbx should be ',' now my $needid=1; IDINDEX: while ($needid && (my $tid = shift @ids)) { ## next unless($class =~ /gene|RNA/ || $tid =~ /gn\d+/); ## FIXME - need another index method to mix FBgn/FBan/FBxx next unless ($tid =~ /$indexidpattern/); if ($tid =~ s/([^:]+)://) { my $db= $1; ## skip not-our-id ids ## $tid= '' unless ($db =~ m/FlyBase|MEOW|euGenes/i); ## ? do we need to check db, if matches $indexidpattern ? } ##if ($tid =~ m/[A-Za-z]*0*(\d+)/) ## need config patt here ##if ($tid =~ m/[A-Z]{2}gn0*(\d+)/) if ($tid =~ m/0*(\d+)/) { #? $needid=0; -- keep going thru all dbx 2ndary FBgn IDS ???? my $idnum= int($1); if (!$didid{$tid} && $idnum < 200000) { # be sure is good idnum $didid{$tid}++; my $size= $ate - $at; my $record= pack("NN", $at, $size); my $idloc = $idnum * RECSIZE; seek(FIDX, $idloc, 0); ## check if already did id == e.g., several feats have same ID, pick 1st? always print FIDX $record; $n++; # ?? also write to single $org id-map.tsv ? and do .idx for it? if ($class =~ /cyto/ && $map && $range eq '-') { $range= $self->getMap2Bases( $map, $csome); } if ($idmapf && $range && $range ne '-') { my($start, $stop)= $self->maxrange($range); my $idat= tell($idmapf); print $idmapf "$tid\t$csome\t$start..$stop\n"; my $ide= tell($idmapf); $size= $ide - $idat; $record= pack("NN", $idat, $size); ##my $idloc = $idnum * RECSIZE; seek($idmapx, $idloc, 0); print $idmapx $record; } } ## last IDINDEX; } } } ##$at = $ate; } close(FIDX); close(FIN); return "indexIds $file = $n / $nl\n"; } ## ## index features*.tsv by base range ## sub indexFeatures { my $self= shift; my($file, $kind, $csome)= @_; local(*FIN,*FIDX); my $n= 0; $kind= 'index' unless($kind); ##return if ($kind ne 'index'); my $idx= $file . ".ranges"; die "Can't read $file" unless (open(FIN,$file)); die "Can't write $idx" unless (open(FIDX,">$idx")); print FIDX "# $idx\n"; print FIDX "# base range -> file index, and source, scaffold ranges\n"; print FIDX "# tab-separated-values of: \n"; print FIDX "# base-start | file-index OR class-name | file-index | location\n"; my $bindex= 0; ##? off by one? was 0; my $nextstep= -666; my $stepsize= 100000; my @csomerange= (0,0); while (<FIN>) { # my $blength= length($_); if (/^\w/) { chomp(); my ($class,$sym,$map,$range,$ids,$dbx)= split(/\t/); my ($start,$stop)= ($kMissingValue, $kMissingValue); if (defined $range && $range =~ /\d/) { ($start, $stop)= $self->maxrange($range); } elsif ($map =~ /\d/) { # !$range || $range eq '-' ($start, $stop)= $self->getMap2Bases( $map, $csome) ; # if ($org =~ /fly/); } if ($start!=$kMissingValue && $stop!=$kMissingValue) { if ($class eq 'source') { @csomerange= ($start, $stop); $stepsize= int( ($stop - $start) / 100 ); } if ($class eq 'source') { # || $class eq 'segment' ## segment not scaffold print FIDX "$class\t$bindex\t$range\n"; } elsif ($nextstep == -666 || $start >= $nextstep) { $nextstep= $start if ($nextstep == -666); $nextstep= 0 unless($nextstep); ## dang perl print FIDX "$nextstep\t$bindex\n"; $nextstep += $stepsize; } $n++; } } # $bindex += $blength; $bindex= tell(FIN); ##? off by one ?? } print FIDX "$csomerange[1]\t$bindex\n"; close(FIN); close(FIDX); return "indexFeatures $file = $n\n"; } 1; __END__ --- NEW FILE: BulkWriter.pm --- package Bio::GMOD::Bulkfiles::BulkWriter; use strict; =head1 NAME Bio::GMOD::Bulkfiles::BulkWriter >>> use ToFormat/ToFasta/ToGFF/.. instead ? =head1 SYNOPSIS base class for other Bulkfiles writers =head1 AUTHOR D.G. Gilbert, 2004, gil...@in... =head1 METHODS =cut #----------------- use POSIX; use FileHandle; use File::Spec::Functions qw/ catdir catfile /; use File::Basename; our $DEBUG = 0; my $VERSION = "1.0"; my $configfile= "bulkwriter"; sub new { my $that= shift; my $class= ref($that) || $that; my %fields = @_; my $self = \%fields; # config should be one bless $self, $class; $self->init_base(); return $self; } sub DESTROY { my $self = shift; } sub init_base { my $self= shift; $DEBUG= $self->{debug} if defined $self->{debug}; $self->{configfile}= $configfile unless defined $self->{configfile}; $self->{failonerror}= 0 unless defined $self->{failonerror}; $self->{skiponerror}= 1 unless defined $self->{skiponerror}; # new common caller arg: fileinfo is the fileset data # <fileset # name="fasta" # path="fasta/[\w\-\_]+.fasta" # title="Fasta sequence of features" # config="toFasta" # handler="FastaWriter" # type="fasta" # date="20040821" # dogzip="1" # etc="..." # /> unless(ref $self->{handler}) { if(ref $self->{sequtil}) { $self->{handler}= $self->{sequtil}; } elsif(ref $self->{bulkfiles}) { $self->{handler}= $self->{bulkfiles}; } else { die "Should make ->new(handler => Bio::GMOD::Bulkfiles object)"; } ##$self->{bulkfiles}= $self->{sequtil}= $self->{handler}; # dang namechange .. or 'bulkfiles' .. which ? } ## add these to %ENV before reading blastfiles.xml so ${vars} get replaced .. my $sconfig= $self->handler()->{config}; my @keys = qw( species org date title rel relfull relid release_url ); @ENV{@keys} = @{%$sconfig}{@keys}; $self->init(); # preconfig inits $self->readConfig($self->{configfile}); } =item init() Subclass: initialize at new() object, once only =cut sub init { my $self= shift; # for subclasses $DEBUG= $self->{debug} if defined $self->{debug}; # $self->{configfile}= $configfile unless defined $self->{configfile}; } sub config { return shift->{config}; } sub handler { return shift->{handler}; } #$self->{bulkfiles}= $self->{sequtil}= $self->{handler} sub sequtil { return shift->{handler}; } sub bulkfiles { return shift->{handler}; } =item readConfig($configfile) read a configuration file - adds to any loaded configs =cut sub readConfig { my $self= shift; my ($configfile)= @_; eval { ## >> no good unless(ref $self->{config2}) { $self->{config2}= $self->{sequtil}->{config2}; } unless(ref $self->{config2}) { require Bio::GMOD::Config2; $self->{config2}= Bio::GMOD::Config2->new( { searchpath => [ 'conf/bulkfiles', 'bulkfiles', 'conf' ], #gmod_root => $ROOT, #confdir => 'conf', ## << change to conf/bulkfiles ? #confpatt => '(gmod|[\w_\-]+db)\.conf$', } ); } $self->{config}= $self->{config2}->readConfig( $configfile); print STDERR $self->{config2}->showConfig( $self->{config}, { debug => $DEBUG }) if ($self->{showconfig}); }; if ($@) { my $cf= $self->{config2}->{filename}; warn "Config2: file=$cf; err: $@"; die if $self->{failonerror}; } $self->initData(); } =item initData Subclass: initialize config & other data after each readConfig =cut sub initData { my($self)= @_; my $config = $self->{config}; my $sconfig= $self->handler()->{config}; my $oroot= $sconfig->{rootpath}; ## use instead $self->handler()->{config} values here? $self->{org}= $sconfig->{org} || $config->{org} || 'noname'; $self->{rel}= $sconfig->{rel} || $config->{rel} || 'noname'; $self->{sourcetitle}= $sconfig->{title} || $config->{title} || 'untitled'; $self->{sourcefile} = $config->{input} || ''; $self->{date}= $sconfig->{date} || $config->{date} || POSIX::strftime("%d-%B-%Y", localtime( $^T )); } #-------------- subs ------------- =item makeFiles( %args ) primary method makes blast indices. input file sets are intermediate chado db dump tables. arguments: infiles => \@fileset, # required =cut sub makeFiles { my $self= shift; my %args= @_; print STDERR "makeFiles\n" if $DEBUG; # debug my $fileset = $args{infiles}; unless(ref $fileset) { warn "makeFiles: no infiles => \@filesets given"; return; } # my @seqfiles= $self->openInput( $fileset ); # my $res= $self->processBlastInput( \@seqfiles); return 1; #what? } =item openInput( $fileset ) handle input files =cut =item openInput( $fileset ) handle input files =cut sub openInput { my $self= shift; my( $fileset, $ipart, $intype )= @_; # do per-csome/name $intype= $self->{config}->{informat} unless ($intype); #? maybe array print STDERR "openInput: type=$intype part=$ipart \n" if $DEBUG; my $atpart= 0; foreach my $fs (@$fileset) { my $fp = $fs->{path}; my $name= $fs->{name}; my $type= $fs->{type}; next unless(!$intype || $fs->{type} =~ /$intype/); unless(-e $fp) { warn "missing infile $fp"; next; } $atpart++; next unless($atpart > $ipart); print STDERR "openInput: name=$name, type=$type, $fp\n" if $DEBUG; ## note: fileset maker probably already set these. my ( $org, $chr1, $featn, $rel, $format )= $self->split_filename($fp); $fs->{org}= $org; $fs->{chr}= $chr1 unless($fs->{chr}); $fs->{featn}= $featn; $fs->{rel}= $rel; $fs->{format}= $format; if ($fp =~ m/\.(gz|Z)$/) { open(INF,"gunzip -c $fp|"); $fs->{pipe}=1; } else { open(INF, $fp); } my $inh= *INF; $fs->{inh}= $inh; return $fs; } print STDERR "openInput: nothing matches part=$ipart\n" if $DEBUG; return undef; } sub resetInput { my $self= shift; my( $infile )= @_; my $inh= $infile->{inh}; my $fp = $infile->{path}; if ($infile->{pipe} || $fp =~ m/\.(gz|Z)$/) { close($inh) if $inh; open(INF,"gunzip -c $fp|"); $inh= *INF; $infile->{pipe}=1; } elsif (!$inh) { open(INF,$fp); $inh= *INF; } else { seek($inh,0,0); } $infile->{inh}= $inh; return $inh; } =item readIdsFromFFF pre-read ids from fff input for selected features for others to add_id or filter by id moved to base class for reuse =cut sub readIdsFromFFF { my $self= shift; my ($fffin,$chr,$config)= @_; my $idlist= {}; my $types_info= $config->{featmap}; my $nid=0; while(<$fffin>) { next unless(/^\w/); chomp; my ($type,$name,$cytomap,$baseloc,$id,$dbxref,$notes,$chr1) = $self->handler()->splitFFF($_, $chr); if ($types_info->{$type}->{get_id}) { $idlist->{$id}= $dbxref; $nid++; } } print STDERR "read ids n=$nid\n" if $DEBUG; return $idlist; } =item get_filename $fname= get_filename( $org, $chr, $featn, $rel, $format) make standard output file name "${org}_${chr}_${featn}_${rel}.${format}" =cut sub get_filename { return shift->handler()->get_filename( @_); } =item split_filename ( $org, $chr, $featn, $rel, $format)= split_filename( $fname) return parts of standard output file name "${org}_${chr}_${featn}_${rel}.${format}" =cut sub split_filename { return shift->handler()->split_filename( @_); } =item isold($source,$target) is source file older than target? ## not for symlinks or dirs =cut sub isold { my $self= shift; my($source,$target) = @_; if (! -e $target) { return 1; } my $res= 0; my $targtime= -M $target; ## -M is file age in days.hrs before now if ( -l $source ) { # $source= getOriginal($source); $res= (-M $source) < $targtime; } elsif ( -e $source ) { $res= (-M $source) < $targtime; } else { $res= 0; } return $res; } 1; __END__ Index: ToFormat.pm =================================================================== RCS file: /cvsroot/gmod/schema/GMODTools/lib/Bio/GMOD/Bulkfiles/ToFormat.pm,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** ToFormat.pm 23 Aug 2004 06:44:41 -0000 1.1 --- ToFormat.pm 30 Aug 2004 04:51:28 -0000 1.2 *************** *** 4,8 **** =head1 NAME ! Bio::GMOD::Bulkfiles::ToFormat -- interface class =cut --- 4,8 ---- =head1 NAME ! Bio::GMOD::Bulkfiles::ToFormat -- base class for other Bulkfiles writers =cut *************** *** 11,16 **** --- 11,23 ---- use lib("/bio/biodb/common/perl/lib", "/bio/biodb/common/system-local/perl/lib"); + use POSIX; + use FileHandle; + use File::Spec::Functions qw/ catdir catfile /; + use File::Basename; + our $DEBUG = 0; my $VERSION = "1.0"; + my $configfile= "bulkwriter"; + sub new *************** *** 21,48 **** my $self = \%fields; # handler, outh should be there bless $self, $class; ! $self->init(); return $self; } ! sub init { my $self= shift; ! unless(ref $self->{handler}) { ! warn "need handler => param"; $self->{handler}= {}; } unless(ref $self->{outh}) { ! warn "need output handle"; $self->{outh}= *STDOUT; } ! $DEBUG= $self->{debug} if defined $self->{debug}; } ! sub DESTROY { ! my $self = shift; ! ## $self->closeit(); close($self->{outh}); #?? ! ## $self->SUPER::DESTROY(); } ! =item get_filename( $org, $chr, $featn, $rel, $format) make standard output file name "${org}_${chr}_${featn}_${rel}.${format}" --- 28,130 ---- my $self = \%fields; # handler, outh should be there bless $self, $class; ! $self->init_base(); return $self; } ! sub DESTROY ! { ! my $self = shift; ! } ! ! sub init_base ! { my $self= shift; ! $DEBUG= $self->{debug} if defined $self->{debug}; ! ## $self->{configfile}= $configfile unless defined $self->{configfile}; ! ! unless(ref $self->{handler}) { ! if(ref $self->{bulkfiles}) { $self->{handler}= $self->{bulkfiles}; } ! else { die "Should make ->new(handler => Bio::GMOD::Bulkfiles object)"; } } unless(ref $self->{outh}) { ! warn "need outh => output-handle"; $self->{outh}= *STDOUT; } ! ! $self->init(); # preconfig inits ! $self->readConfig($self->{configfile}); } ! ! =item init() ! ! Subclass: initialize at new() object, once only ! ! =cut ! ! sub init { ! my $self= shift; ! ! $self->{configfile}= $configfile unless defined $self->{configfile}; ! # for subclasses } ! sub config { return shift->{config}; } ! ! =item readConfig($configfile) ! ! read a configuration file - adds to any loaded configs ! ! =cut ! ! sub readConfig ! { ! my $self= shift; ! my ($configfile)= @_; ! eval { ! ## >> no good unless(ref $self->{config2}) { $self->{config2}= $self->{sequtil}->{config2}; } ! unless(ref $self->{config2}) { ! require Bio::GMOD::Config2; ! $self->{config2}= Bio::GMOD::Config2->new( { ! #gmod_root => $ROOT, ! #confdir => 'conf', ## << change to conf/bulkfiles ? ! #confpatt => '(gmod|[\w_\-]+db)\.conf$', ! } ); ! } ! ! $self->{config}= $self->{config2}->readConfig( $configfile); ! print STDERR $self->{config2}->showConfig( $self->{config}, { debug => $DEBUG }) ! if ($self->{showconfig}); ! }; warn "Config2 err: $@" if ($@); ! ! $self->initData(); ! } + + =item initData + + Subclass: initialize config & other data after each readConfig + + =cut + + sub initData + { + my($self)= @_; + my $config = $self->{config}; + my $sconfig= $self->{sequtil}->{config}; + my $oroot= $sconfig->{rootpath}; + + ## use instead $self->{sequtil}->{config} values here? + $self->{org}= $sconfig->{org} || $config->{org} || 'noname'; + $self->{rel}= $sconfig->{rel} || $config->{rel} || 'noname'; + $self->{sourcetitle}= $sconfig->{title} || $config->{title} || 'untitled'; + $self->{sourcefile} = $config->{input} || ''; + $self->{date}= $sconfig->{date} || $config->{date} || POSIX::strftime("%d-%B-%Y", localtime( $^T )); + } + + + =item get_filename + + $fname= get_filename( $org, $chr, $featn, $rel, $format) make standard output file name "${org}_${chr}_${featn}_${rel}.${format}" *************** *** 51,66 **** sub get_filename { my $self= shift; ! my( $org, $chr, $featn, $rel, $format)= @_; ! if ( $featn ) { $featn="_${featn}"; } else { $featn=''; } ! if ( $chr ) { $chr="_${chr}"; } else { $chr=''; } ! if ( $rel ) { $rel="_${rel}"; } else { $rel=''; } ! if (! $format ) { $format="undef"; } ! #?? leave to later# elsif ($format eq 'fff') { $format= 'tsv'; } # preserve old naming ?? ! my $filename="${org}${chr}${featn}${rel}.${format}"; ! return $filename; } sub writeheader { --- 133,183 ---- sub get_filename { + return shift->{sequtil}->get_filename( @_); + } + + =item split_filename + + ( $org, $chr, $featn, $rel, $format)= split_filename( $fname) + return parts of standard output file name "${org}_${chr}_${featn}_${rel}.${format}" + + =cut + + sub split_filename + { + return shift->{sequtil}->split_filename( @_); + } + + + =item isold($source,$target) + + is source file older than target? + + =cut + + sub isold + { my $self= shift; ! my($source,$target) = @_; ! ## not for symlinks or dirs ! my $res= 0; ! my $targtime= -M $target; ## -M is file age in days.hrs before now ! if (! -f $target) { ! return 1; ! } ! elsif ( -l $source ) { ! # $source= getOriginal($source); ! $res= (-M $source) < $targtime; ! } ! elsif ( -f $source ) { ! $res= (-M $source) < $targtime; ! } ! else { $res= 0; } ! return $res; } + #------- writing methods ----------- + + sub writeheader { *************** *** 105,108 **** --- 222,227 ---- } + + #------------- 1; --- NEW FILE: FastaWriter.pm --- package Bio::GMOD::Bulkfiles::FastaWriter; use strict; =head1 NAME Bio::GMOD::Bulkfiles::FastaWriter =head1 SYNOPSIS use Bio::GMOD::Bulkfiles; my $sequtil= Bio::GMOD::Bulkfiles->new( configfile => 'seqdump-r4', ); my $fwriter= $sequtil->getFastaWriter(); my $result = $fwriter->makeFiles( ); =head1 NOTES genomic sequence file utilities, part3; parts from flybase/work.local/chado_r3_2_26/soft/chado2flat2.pl =head1 AUTHOR D.G. Gilbert, 2004, gil...@in... =head1 METHODS =cut #----------------- # debug use lib("/bio/biodb/common/perl/lib", "/bio/biodb/common/system-local/perl/lib"); use POSIX; use FileHandle; use File::Spec::Functions qw/ catdir catfile /; use File::Basename; use vars qw(@ISA); use Bio::GMOD::Bulkfiles::BulkWriter; @ISA= (qw/Bio::GMOD::Bulkfiles::BulkWriter/); ## want interface class our $DEBUG = 0; my $VERSION = "1.0"; my $configfile= "fastawriter"; #? BulkFiles/FastaWriter.xml sub init { my $self= shift; $self->SUPER::init(); $DEBUG= $self->{debug} if defined $self->{debug}; $self->{configfile}= $configfile unless defined $self->{configfile}; } =item initData initialize data from config =cut sub initData { my($self)= @_; $self->SUPER::initData(); # my $config = $self->{config}; # my $sconfig= $self->handler()->{config}; ## fastafiles my $finfo= $self->{fileinfo} || $self->handler()->getFilesetInfo('fasta'); my $fastadir= $self->handler()->getReleaseSubdir( $finfo->{path} || 'fasta/'); $self->{fastadir} = $fastadir; $self->{addids} = $finfo->{addids}; $self->{dropnotes} = $finfo->{dropnotes}; $self->{allowanyfeat} = $finfo->{allowanyfeat}; $self->{makeall} = $finfo->{makeall}; $self->{dogzip} = $finfo->{dogzip}; $self->{perchr} = $finfo->{perchr}; } #-------------- subs ------------- =item makeFiles( %args ) primary method arguments: infiles => \@fileset of fff features and dna sequences, # required =cut sub makeFiles { my $self= shift; my %args= @_; print STDERR "FastaWriter::makeFiles\n" if $DEBUG; # debug # more sensible that writer should ask handler for kind of files it wants my $fileset = $args{infiles}; my $chromosomes = $args{chromosomes}; unless(ref $fileset) { my $intype= $self->{config}->{informat} || 'fff'; #? maybe array $fileset = $self->handler->getFiles($intype, $chromosomes); # warn "makeFiles: no infiles => \@filesets given"; return; } my $featset= $self->handler->{config}->{featset} || []; #? or default set ? my $addids = defined $args{addids} ? $args{addids} : $self->{addids}; my $ok= 1; for (my $ipart= 0; $ok; $ipart++) { $ok= 0; my $infile= $self->openInput( $fileset, $ipart); if ($infile && $infile->{inh}) { my $inh= $infile->{inh}; my $chr= $infile->{chr}; if ($addids) { my $idlist= $self->readIdsFromFFF( $inh, $chr, $self->handler()->{config}); # for featmap ? $self->{idlist}= $idlist; $inh= $self->resetInput($infile); #seek($inh,0,0); ## cant do on STDIN ! cant do on PIPE ! } ## need to know $chr here .. from $fileset infile my $res= $self->processFasta( $inh, $chr, $featset); close($inh); delete $infile->{inh}; $ok= 1; } } print STDERR "FastaWriter::makeFiles: done\n" if $DEBUG; return 1; #what? } =item openInput( $fileset ) handle input files .. copied to base class =cut sub openInput { my $self= shift; my( $fileset, $ipart )= @_; # do per-csome/name my $intype= $self->{config}->{informat} || 'fff'; #? maybe array my $atpart= 0; print STDERR "openInput: type=$intype part=$ipart \n" if $DEBUG; foreach my $fs (@$fileset) { my $fp = $fs->{path}; my $name= $fs->{name}; my $type= $fs->{type}; next unless( $fs->{type} =~ /$intype/); # could it be 'dna/fasta', 'amino/fasta' ? unless(-e $fp) { warn "missing infile $fp"; next; } $atpart++; next unless($atpart > $ipart); print STDERR "openInput: name=$name, type=$type, $fp\n" if $DEBUG; my ( $org, $chr1, $featn, $rel, $format )= $self->split_filename($fp); $fs->{org}= $org; $fs->{chr}= $chr1 unless($fs->{chr}); $fs->{featn}= $featn; $fs->{rel}= $rel; $fs->{format}= $format; if ($fp =~ m/\.(gz|Z)$/) { open(INF,"gunzip -c $fp|"); $fs->{pipe}=1; } else { open(INF, $fp); } my $inh= *INF; $fs->{inh}= $inh; return $fs; } print STDERR "openInput: nothing matches part=$ipart\n" if $DEBUG; return undef; } sub resetInput { my $self= shift; my( $infile )= @_; my $inh= $infile->{inh}; my $fp = $infile->{path}; if ($infile->{pipe} || $fp =~ m/\.(gz|Z)$/) { close($inh) if $inh; open(INF,"gunzip -c $fp|"); $inh= *INF; $infile->{pipe}=1; } elsif (!$inh) { open(INF,$fp); $inh= *INF; } else { seek($inh,0,0); } $infile->{inh}= $inh; return $inh; } =item processFasta =cut sub processFasta { my $self= shift; my( $inh, $chr, $featset )= @_; my $ndone= 0; my $outh= {}; my $fastadir= $self->{fastadir}; my @features= @$featset; # special case for feat == chromosome/dna -> raw2Fasta my @fffeatures= @features; if (my ($featn)= grep /^chromosome/, @features) { my $fn= $self->get_filename ( $self->{org}, $chr, 'chromosome', $self->{rel}, 'fasta'); $fn= catfile($fastadir, $fn); $self->raw2Fasta( chr => $chr, fastafile => $fn); $ndone++; @fffeatures= grep !/^chromosome/, @features; } foreach my $featn (@fffeatures) { my $fn= $self->get_filename ( $self->{org}, $chr, $featn, $self->{rel}... [truncated message content] |