From: <fa...@us...> - 2011-10-28 14:06:18
|
Revision: 2386 http://tab2mage.svn.sourceforge.net/tab2mage/?rev=2386&view=rev Author: farne Date: 2011-10-28 14:06:07 +0000 (Fri, 28 Oct 2011) Log Message: ----------- Natalja's changes to support import of data from Sanger Modified Paths: -------------- branches/ae-dev/Tab2MAGE/lib/ArrayExpress/SRA_XML/SRAtoMAGETAB.pm Modified: branches/ae-dev/Tab2MAGE/lib/ArrayExpress/SRA_XML/SRAtoMAGETAB.pm =================================================================== --- branches/ae-dev/Tab2MAGE/lib/ArrayExpress/SRA_XML/SRAtoMAGETAB.pm 2011-10-28 13:47:57 UTC (rev 2385) +++ branches/ae-dev/Tab2MAGE/lib/ArrayExpress/SRA_XML/SRAtoMAGETAB.pm 2011-10-28 14:06:07 UTC (rev 2386) @@ -198,37 +198,52 @@ $self->get_investigation->set_title($title); # Set experiment type - my $study_type = $xml->{STUDY}->{DESCRIPTOR}->{STUDY_TYPE}->{existing_study_type}; - my $efo_type = $EFO_STUDY_TYPE_FOR{$study_type}; - if ($efo_type){ - $study_type = $efo_type; - } - else{ - if ($study_type eq "Other"){ - $study_type = $xml->{STUDY}->{DESCRIPTOR}->{STUDY_TYPE}->{new_study_type} - } - $self->log("Using study type not mapped to EFO - $study_type", "warn"); - } - #Added by Natalja - if (length($study_type)==0){ - $study_type = "transcription profiling by high throughput sequencing"; - } - $self->_add_expt_comment("AEExperimentType", $study_type); - - # get additional study types from STUDY_ATTRIBUTE - # FIXME: TAG for these has not yet been agreed + #my $study_type = $xml->{STUDY}->{DESCRIPTOR}->{STUDY_TYPE}->{existing_study_type}; + + my $value = $xml->{STUDY}->{DESCRIPTOR}->{STUDY_TYPE}->{existing_study_type}; + my @design_types; - foreach my $att (grep { $_->{TAG} eq $STUDY_TYPE_TAG} - @{ $xml->{STUDY}->{STUDY_ATTRIBUTES}->{STUDY_ATTRIBUTE} || [] }){ - my $value = $att->{VALUE}; + #foreach my $att (grep { $_->{TAG} eq $STUDY_TYPE_TAG} + # @{ $xml->{STUDY}->{STUDY_ATTRIBUTES}->{STUDY_ATTRIBUTE} || [] }){ + # my $value = $att->{VALUE}; my $oe = Bio::MAGETAB::ControlledTerm->new({ category => "ExperimentDesignType", value => $value, }); push @design_types, $oe; - } + #} $self->get_investigation->set_designTypes(\@design_types); + + #my $efo_type = $EFO_STUDY_TYPE_FOR{$study_type}; + #if ($efo_type){ + # $study_type = $efo_type; + #} + #else{ + # if ($study_type eq "Other"){ + # $study_type = $xml->{STUDY}->{DESCRIPTOR}->{STUDY_TYPE}->{new_study_type} + # } + # $self->log("Using study type not mapped to EFO - $study_type", "warn"); + #} + #Added by Natalja + #if (length($study_type)==0){ + # $study_type = "transcription profiling by high throughput sequencing"; + #} + #$self->_add_expt_comment("AEExperimentType", $study_type); + # get additional study types from STUDY_ATTRIBUTE + # FIXME: TAG for these has not yet been agreed + #my @design_types; + #foreach my $att (grep { $_->{TAG} eq $STUDY_TYPE_TAG} + # @{ $xml->{STUDY}->{STUDY_ATTRIBUTES}->{STUDY_ATTRIBUTE} || [] }){ + # my $value = $att->{VALUE}; + # my $oe = Bio::MAGETAB::ControlledTerm->new({ + # category => "ExperimentDesignType", + # value => $value, + # }); + # push @design_types, $oe; + #} + #$self->get_investigation->set_designTypes(\@design_types); + # Set description text my $abstract = $xml->{STUDY}->{DESCRIPTOR}->{STUDY_ABSTRACT}; my $description = $xml->{STUDY}->{DESCRIPTOR}->{STUDY_DESCRIPTION}; @@ -271,6 +286,10 @@ $self->get_investigation->set_publicReleaseDate(substr($date,0,10)); my $e_mail = ""; + + if (!($lab)){ + $lab = "Wellcome Trust Sanger Institute"; + } if (trim($lab)=="Wellcome Trust Sanger Institute"){ $e_mail = "datahose\@sanger.ac.uk"; @@ -294,11 +313,46 @@ } sub parse_sample_xml{ + my %design_type_mapping = ( + 'ArrayExpress-Genotype' => 'genotype design', + 'ArrayExpress-Phenotype' => 'observational design', + 'ArrayExpress-DevelopmentalStage' => 'development or differentiation design', + 'ArrayExpress-CellType' => 'cell type comparison design', + 'ArrayExpress-DiseaseState' => 'disease state design', + 'ArrayExpress-Compound' => 'compound treatment design', + 'ArrayExpress-Immunoprecipitate' => 'binding site identification design', + 'ArrayExpress-GrowthCondition' => 'growth condition design', + 'ArrayExpress-RNAi' => 'cellular modification design', + 'ArrayExpress-OrganismPart' => 'organism part comparison design', + 'ArrayExpress-Species' => 'species design', + 'ArrayExpress-TimePoint' => 'time series design', + ); + + my %factor_type_mapping = ( + 'ArrayExpress-Genotype' => 'genotype', + 'ArrayExpress-Phenotype' => 'phenotype', + 'ArrayExpress-DevelopmentalStage' => 'developmental_stage', + 'ArrayExpress-CellType' => 'cell_type', + 'ArrayExpress-DiseaseState' => 'disease_state', + 'ArrayExpress-Compound' => 'compound', + 'ArrayExpress-Immunoprecipitate' => 'immunoprecipitate', + 'ArrayExpress-GrowthCondition' => 'growth_condition', + 'ArrayExpress-RNAi' => 'rnai', + 'ArrayExpress-OrganismPart' => 'organism_part', + 'ArrayExpress-Species' => 'organism', + 'ArrayExpress-TimePoint' => 'time', + ); + + my ($self, $sample_xml) = @_; my $xml = XMLin($sample_xml, ForceArray => \@force_array); + my @design_types; + + my %fv_valuecount; + foreach my $sample(@{ $xml->{SAMPLE} || [] }){ my $acc = $sample->{accession}; $self->log("Found sample accession: $acc"); @@ -306,6 +360,10 @@ name => $acc }); $self->_add_comment("ENA_SAMPLE", $acc, $source); + + my $alias = $sample->{alias}; + $self->_add_comment("ALIAS", $alias, $source); + my $descr = $sample->{DESCRIPTION}; $source->set_description($descr) if ($descr and ref($descr) eq "SCALAR"); @@ -317,31 +375,70 @@ $self->_add_comment("INDIVIDUAL_NAME", $sample->{SAMPLE_NAME}->{INDIVIDUAL_NAME}, $source) if $sample->{SAMPLE_NAME}->{INDIVIDUAL_NAME}; - my @chars; - my $taxon_id = $sample->{SAMPLE_NAME}->{TAXON_ID}; - my $species = $sample->{SAMPLE_NAME}->{SCIENTIFIC_NAME}; - if ($species or $taxon_id){ - my $species_attr = $self->create_species_attribute($species, $taxon_id); - push @chars, $species_attr; - } - else{ - $self->log("no species found for sample $acc","warn"); - } - - foreach my $attr ( @{ $sample->{SAMPLE_ATTRIBUTES}->{SAMPLE_ATTRIBUTE} || [] }){ + my @chars; + my $organism = 0; + foreach my $attr ( @{ $sample->{SAMPLE_ATTRIBUTES}->{SAMPLE_ATTRIBUTE} || [] }){ my $category = $attr->{TAG}; next if $category =~ /^ENA-.*/; my $value = $attr->{VALUE}; - next if $value !=~ /[A-Z][a-z][a-z]*[0-9][0-9]/; - my $char = Bio::MAGETAB::ControlledTerm->new({ - category => $category, - value => $attr->{VALUE}, - }); - push @chars, $char; + next if (!($value =~ /^(\w+\s*)+$/)); + next if lc($value) =~ /^not applicable$/; + if (exists($design_type_mapping{$category})){ + $fv_valuecount{$category}{$value}++; + $category = $factor_type_mapping{$category}; + } + $self->log("Category: $category Value: $value"); + if ((($category eq "phenotype") or ($category eq "genotype")) and $value eq "wt"){ + $value = "wild_type"; + } + if ($category eq "organism"){ + $organism = 1; + my $taxon_id = $sample->{SAMPLE_NAME}->{TAXON_ID}; + if ($value or $taxon_id){ + my $species_attr = $self->create_species_attribute($value, $taxon_id); + push @chars, $species_attr; + } + } + else{ + my $char = Bio::MAGETAB::ControlledTerm->new({ + category => $category, + value => $value, + }); + push @chars, $char; + } + + } $source->set_characteristics(\@chars); push @{ $self->get_all_nodes }, $source; + + if ($organism eq 0){ + my $taxon_id = $sample->{SAMPLE_NAME}->{TAXON_ID}; + my $species = $sample->{SAMPLE_NAME}->{SCIENTIFIC_NAME}; + if ($species or $taxon_id){ + my $species_attr = $self->create_species_attribute($species, $taxon_id); + push @chars, $species_attr; + } + else{ + $self->log("no species found for sample $acc","warn"); + } + } } + + #add study designs + while ( my ( $cat, $valhash ) = each %fv_valuecount ) { + my %hash = %{$valhash}; + + if ( scalar(grep { defined $_ } values %{ $valhash }) >1 or ($cat =~ /Immunoprecipitate/i)) { + my $oe = Bio::MAGETAB::ControlledTerm->new({ + category => "ExperimentDesignType", + value => $design_type_mapping{$cat}, + }); + push @design_types, $oe; + } + } + + $self->get_investigation->set_designTypes(\@design_types); } @@ -360,6 +457,8 @@ value => "high_throughput_sequencing", }); + my %study_type = ""; + my $paired = 0; foreach my $expt (@{ $xml->{EXPERIMENT} || [] }){ my $acc = $expt->{accession}; @@ -387,6 +486,20 @@ }); # Add library descriptor paramters as Extract Comments + foreach my $type qw(LIBRARY_STRATEGY){ + my $value = $expt->{DESIGN}->{LIBRARY_DESCRIPTOR}->{$type}; + $value = trim($value); + if ($value =~ m/RNA-Seq/){ + $value = "transcription profiling by high throughput sequencing"; + } + if (!exists($study_type{$value})){ + $study_type{$value} = $value; + } + + } + + + foreach my $type qw(LIBRARY_STRATEGY LIBRARY_SOURCE LIBRARY_SELECTION){ my $value = $expt->{DESIGN}->{LIBRARY_DESCRIPTOR}->{$type}; $self->_add_comment($type, $value, $extract); @@ -398,6 +511,7 @@ $self->_add_comment($type, $value, $extract); } if ($expt->{DESIGN}->{LIBRARY_DESCRIPTOR}->{LIBRARY_LAYOUT}->{PAIRED}){ + $paired = 1; $value = "PAIRED"; $self->_add_comment($type, $value, $extract); my $type_paired = "ORIENTATION"; @@ -415,6 +529,22 @@ $value = $expt->{DESIGN}->{LIBRARY_DESCRIPTOR}->{LIBRARY_LAYOUT}->{PAIRED}->{$type_paired}; $self->_add_comment($type_paired, $value, $extract); } + + #READ_SPEC for PAIRED READS + foreach my $read (@{ $expt->{"DESIGN"}->{"SPOT_DESCRIPTOR"}->{"SPOT_DECODE_SPEC"}->{"READ_SPEC"} || [] }){ + my $type_paired = "READ_TYPE"; + if ($read->{$type_paired}){ + my $value = $read->{$type_paired}; + if ($value eq "BarCode"){ + $type_paired = "EXPECTED_BASECALL_TABLE"; + if ($read->{$type_paired}->{BASECALL}->{content}){ + $value = $read->{$type_paired}->{BASECALL}->{content}; + $self->_add_comment("BARCODE", $value, $extract); + } + } + } + } + } } @@ -523,6 +653,7 @@ my $fastq_path = trim($entries[16]); my $run_acc = trim($entries[3]); $all_study_runs{$run_acc} = 1; + my $data = Bio::MAGETAB::DataAcquisition->new({ name => $run_acc, @@ -560,9 +691,91 @@ # push @{ $self->get_all_nodes }, $data; # } #} + + foreach my $run_acc (@{ $links->{"ENA-RUN"} || [] }){ + # Add scan + if (!(exists($all_study_runs{$run_acc}))){ + $all_study_runs{$run_acc} = 1; + if ($paired==1){ + my $fastq_path = substr($run_acc,0,6); + my $scan_name = $run_acc."_1"; + my $data_1 = Bio::MAGETAB::DataAcquisition->new({ + name => $scan_name, + }); + + + $self->_add_comment("FASTQ_URI", "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/$fastq_path/$run_acc/$scan_name.fastq.gz", $data_1); + $self->_add_comment("ENA_RUN", $run_acc, $data_1); + my $data_edge = Bio::MAGETAB::Edge->new({ + inputNode => $assay, + outputNode => $data_1, + }); + + $scan_name = $run_acc."_2"; + my $data_2 = Bio::MAGETAB::DataAcquisition->new({ + name => $scan_name, + }); + + $self->_add_comment("FASTQ_URI", "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/$fastq_path/$run_acc/$scan_name.fastq.gz", $data_2); + $self->_add_comment("ENA_RUN", $run_acc, $data_2); + my $data_edge = Bio::MAGETAB::Edge->new({ + inputNode => $assay, + outputNode => $data_2, + }); + + + #READ_SPEC for PAIRED READS + foreach my $read (@{ $expt->{"DESIGN"}->{"SPOT_DESCRIPTOR"}->{"SPOT_DECODE_SPEC"}->{"READ_SPEC"} || [] }){ + + my $type_paired = "READ_TYPE"; + if ($read->{$type_paired}){ + my $value = $read->{$type_paired}; + if ($value eq "Forward") { + $self->_add_comment($type_paired, $value, $data_1); + $type_paired = "BASE_COORD"; + if ($read->{$type_paired}){ + $value = $read->{$type_paired}; + $self->_add_comment($type_paired, $value, $data_1); + } + } + if ($value eq "Reverse"){ + $self->_add_comment($type_paired, $value, $data_2); + $type_paired = "BASE_COORD"; + if ($read->{$type_paired}){ + $value = $read->{$type_paired}; + $self->_add_comment($type_paired, $value, $data_2); + } + } + } + } + push @{ $self->get_all_nodes }, $data_1; + push @{ $self->get_all_nodes }, $data_2; + } + else { + my $data = Bio::MAGETAB::DataAcquisition->new({ + name => $run_acc, + }); + + my $fastq_path = substr($run_acc,1,6); + $self->_add_comment("FASTQ_URI", "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/$fastq_path/$run_acc/$run_acc.fastq.gz", $data); + $self->_add_comment("ENA_RUN", $run_acc, $data); + my $data_edge = Bio::MAGETAB::Edge->new({ + inputNode => $assay, + outputNode => $data, + }); + push @{ $self->get_all_nodes }, $data; + } + } + } + } + + while( my ($k, $v) = each(%study_type)) { + $self->_add_expt_comment("AEExperimentType", $k); + } + $self->log("Adding run list to IDF"); my $runs = get_range_from_list("ERR", keys %all_study_runs); $self->_add_expt_comment("SequenceDataURI", "http://www.ebi.ac.uk/ena/data/view/$runs"); @@ -740,7 +953,7 @@ # If a category has more than 1 value make it a factor foreach my $type (keys %types_to_values){ - #if ( (keys %{ $types_to_values{$type} }) > 1){ + if ( ((keys %{ $types_to_values{$type} }) > 1) or ($type =~ /Immunoprecipitate/i)){ $self->log("Adding $type factor values"); @@ -776,7 +989,7 @@ $row->set_factorValues( [ @{ $existing || [] }, $fv ] ); } } - #} + } } return; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |