Thursday, December 17, 2015

Perl: Retrieve sequences


Abstract: Retrieve DNA/protein sequence and their annotations from the databases:GenBank, Swissprot, and EntrezGene.

Here are the three Perl functions used for retrieving annotations from GenBank, Swissprot and EntrezGene using BioPerl.


use Bio::SeqIO;
use Bio::DB::GenBank;
use Bio::DB::Query::GenBank;
use Bio::DB::SwissProt;
use Bio::DB::EntrezGene;

####################
sub GenBank_by_acc{#1
my ($p, $accs_pointer, $out_dir)=@_;
my @accs=@$accs_pointer;
my $n=0;
#GenBank query;
my $db_obj = Bio::DB::GenBank->new;
foreach my $acc(@accs){#2
$n++;
my $query_file=$out_dir.$acc;
if (-f $query_file){
#printf( "\tskip %d: %s\n", $p+$n, $acc);
}
else{
my %rna_info;
my $seq = $db_obj->get_Seq_by_id($acc);
$rna_info{description}=$seq->desc;
$rna_info{rna_len}=$seq->length;
foreach my $feat ( $seq->get_SeqFeatures() ) { #3 features circyling
my %feature;
my $tag_name=$feat->primary_tag();
$feature{name}=$tag_name;
#get feature's info
foreach my $tag ( $feat->get_all_tags() ) { #4
my $tag_value=join("", $feat->get_tag_values($tag));
if($tag=~/db_xref/){
my($a, $b)=split(":", $tag_value);
$feature{$a}=$b;
}
else{ $feature{$tag}=$tag_value; }
}#4
#extract source_features
if ( $tag_name=~/source/ ) { #4
$rna_info{chromosome}=exists $feature{chromosome} ? $feature{chromosome}: 'NA';
$rna_info{map}=exists $feature{map} ? $feature{map} : 'NA';
$rna_info{organism}=exists $feature{organism} ? $feature{organism} : 'NA';
$rna_info{mol_type}=$feature{mol_type} if exists $feature{mol_type};
}#4
elsif ( $tag_name=~/gene/ ) { #4
$rna_info{gene}= exists $feature{gene} ? $feature{gene} : 'NA';
$rna_info{GeneID}= exists $feature{GeneID} ? $feature{GeneID} : 'NA';
$rna_info{gene_synonym}=exists $feature{gene_synonym} ? $feature{gene_synonym} : 'NA';
$rna_info{note}=exists $feature{note} ? $feature{note} : 'NA';
}#4
elsif ( $tag_name=~/CDS/ ) { #4
$rna_info{GI}= exists $feature{GI} ? $feature{GI} : 'NA';
$rna_info{product}= exists $feature{product} ? $feature{product} : 'NA';
$rna_info{protein_id}= exists $feature{protein_id} ? $feature{protein_id} : 'NA';
$rna_info{EC_number}= exists $feature{EC_number} ? $feature{EC_number} : 'NA';
}
}#3
#export
sub_common::hash_to_file(\%rna_info, $query_file, '=');
print $p+$n, ":$acc\n";
}
}#2
}

######
sub SwissProt_by_acc{
my ($p, $transcript_ids_pointer, $uniprot_accs_hash_pointer, $out_dir)=@_;
my @transcript_ids=@$transcript_ids_pointer;
my %uniprot_accs_hash=%$uniprot_accs_hash_pointer;
my $n=0;
#
my @feature_tags=('KEGG','GO','InterPro');
my$sp_obj = Bio::DB::SwissProt->new();
foreach my $transcript_id(@transcript_ids){#2
my %pro_info;
my $uniprot_acc_str=$uniprot_accs_hash{$transcript_id};
my @accs=split(',', $uniprot_acc_str);
foreach my $acc(@accs){#3
eval{
my $seq_obj;
if ($seq_obj=$sp_obj->get_Seq_by_acc($acc) ){
$pro_info{pro_len}=$seq_obj->length;
#get annotations
my $anno_collection=$seq_obj->annotation;
foreach my $key ( $anno_collection->get_all_annotation_keys ) { #4 features circyling
#print "$key\n";
my @annotations=$anno_collection->get_Annotations($key);
#print "@annotations\n";
for my $value(@annotations){#5
#printf("\t%s###%s\n", $value->tagname, $value->display_text);
if ($value->tagname eq 'dblink'){#6
my@a=split(":", $value->display_text);
my $f_tag=shift @a;
my $f_value=join(':', @a);
#printf("\t%s###%s\t%s\n", $value->tagname, $f_tag, $f_value);
$pro_info{$f_tag}= $pro_info{$f_tag} ? $pro_info{$f_tag}.','.$f_value : $f_value;
}#6
}#5
}#4
}
};
printf("No %s found\n", $acc) if $@; #return error when query failure
}#3
#export
foreach my $tag_name(@feature_tags){#3
my $tag_value='NA';
if ($pro_info{$tag_name}){
my @values=split(',', $pro_info{$tag_name});
@values=List::MoreUtils::uniq @values;
$tag_value=join(',', @values);
}
sub_basic::refresh_log($out_dir.$transcript_id, $tag_name, $tag_value);
printf("%d:%s\t%s\n", $p+$n, $transcript_id, $tag_value);
}#3
$n++;
}#2
}

##########
sub EntrezGene_by_acc{
my ($p, $transcript_ids_pointer, $out_dir)=@_;
my @transcript_ids=@$transcript_ids_pointer;
my $n=0;
#
my$query_obj = Bio::DB::EntrezGene->new();
foreach my $transcript_id(@transcript_ids){#2
eval{
my $seq_obj;
if ($seq_obj=$query_obj->get_Seq_by_acc($transcript_id) ){
my %feature;
#get annotations
my $anno_collection=$seq_obj->annotation;
foreach my $key ( $anno_collection->get_all_annotation_keys ) { #4 features circyling
#print "$key\n";
my @annotations=$anno_collection->get_Annotations($key);
#print "@annotations\n";
for my $value(@annotations){#5
#printf("\t%s###%s\n", $value->tagname, $value->display_text);
if ($value->tagname eq 'Function'){#6
my $GO_term='GO:'.$value->display_text;
my $GO_cat='GO_MF';
$feature{$GO_cat}= $feature{$GO_cat} ? $feature{$GO_cat}.','.$GO_term : $GO_term;
}#6
elsif ($value->tagname eq 'Process'){#6
my $GO_term='GO:'.$value->display_text;
my $GO_cat='GO_BP';
$feature{$GO_cat}= $feature{$GO_cat} ? $feature{$GO_cat}.','.$GO_term : $GO_term;
}#6
elsif ($value->tagname eq 'Component'){#6
my $GO_term='GO:'.$value->display_text;
my $GO_cat='GO_CC';
$feature{$GO_cat}= $feature{$GO_cat} ? $feature{$GO_cat}.','.$GO_term : $GO_term;
}#6
elsif (List::Util::first {$value->tagname eq $_} ('MIM', 'HPRD', 'HGNC', 'Exon count', 'Ensembl') ){#6
$feature{$value->tagname}=$value->display_text;
}#6
}#5
}#4
$feature{'GO_Entrez'} = $feature{'GO_Entrez'} ? $feature{'GO_Entrez'}.','. $feature{'GO_BP'} : $feature{'GO_BP'} if $feature{'GO_BP'};
$feature{'GO_Entrez'} = $feature{'GO_Entrez'} ? $feature{'GO_Entrez'}.','. $feature{'GO_CC'} : $feature{'GO_CC'} if $feature{'GO_CC'};
$feature{'GO_Entrez'} = $feature{'GO_Entrez'} ? $feature{'GO_Entrez'}.','. $feature{'GO_MF'} : $feature{'GO_MF'} if $feature{'GO_MF'};
#export
$feature{'EntrezGene'}='yes';
printf("%d:%s\n", $p+$n, $transcript_id);
foreach my $k(keys %feature){
sub_basic::refresh_log($out_dir.$transcript_id, $k, $feature{$k});
printf("\t%s: %s\n", $k, $feature{$k});
}
$n++;
}
};
printf ("No %s found.\n", $transcript_id) if $@; #return error when query;
}#2
}

No comments:

Post a Comment