Abstract:
Retrieve DNA/protein sequence and their annotations from the
databases:GenBank, Swissprot, and EntrezGene.
Here
are the three Perl functions used for retrieving annotations from
GenBank, Swissprot and EntrezGene using BioPerl.
use
Bio::SeqIO;
use
Bio::DB::GenBank;
use
Bio::DB::Query::GenBank;
use
Bio::DB::SwissProt;
use
Bio::DB::EntrezGene;
####################
sub
GenBank_by_acc{#1
my
($p, $accs_pointer, $out_dir)=@_;
my
@accs=@$accs_pointer;
my
$n=0;
#GenBank
query;
my
$db_obj = Bio::DB::GenBank->new;
foreach
my $acc(@accs){#2
$n++;
my
$query_file=$out_dir.$acc;
if
(-f $query_file){
#printf(
"\tskip %d: %s\n", $p+$n, $acc);
}
else{
my
%rna_info;
my
$seq = $db_obj->get_Seq_by_id($acc);
$rna_info{description}=$seq->desc;
$rna_info{rna_len}=$seq->length;
foreach
my $feat ( $seq->get_SeqFeatures() ) { #3 features circyling
my
%feature;
my
$tag_name=$feat->primary_tag();
$feature{name}=$tag_name;
#get
feature's info
foreach
my $tag ( $feat->get_all_tags() ) { #4
my
$tag_value=join("", $feat->get_tag_values($tag));
if($tag=~/db_xref/){
my($a,
$b)=split(":", $tag_value);
$feature{$a}=$b;
}
else{ $feature{$tag}=$tag_value; }
}#4
#extract
source_features
if
( $tag_name=~/source/ ) { #4
$rna_info{chromosome}=exists
$feature{chromosome} ? $feature{chromosome}: 'NA';
$rna_info{map}=exists
$feature{map} ? $feature{map} : 'NA';
$rna_info{organism}=exists
$feature{organism} ? $feature{organism} : 'NA';
$rna_info{mol_type}=$feature{mol_type}
if exists $feature{mol_type};
}#4
elsif
( $tag_name=~/gene/ ) { #4
$rna_info{gene}=
exists $feature{gene} ? $feature{gene} : 'NA';
$rna_info{GeneID}=
exists $feature{GeneID} ? $feature{GeneID} : 'NA';
$rna_info{gene_synonym}=exists
$feature{gene_synonym} ? $feature{gene_synonym} : 'NA';
$rna_info{note}=exists
$feature{note} ? $feature{note} : 'NA';
}#4
elsif
( $tag_name=~/CDS/ ) { #4
$rna_info{GI}=
exists $feature{GI} ? $feature{GI} : 'NA';
$rna_info{product}=
exists $feature{product} ? $feature{product} : 'NA';
$rna_info{protein_id}=
exists $feature{protein_id} ? $feature{protein_id} : 'NA';
$rna_info{EC_number}=
exists $feature{EC_number} ? $feature{EC_number} : 'NA';
}
}#3
#export
sub_common::hash_to_file(\%rna_info,
$query_file, '=');
print
$p+$n, ":$acc\n";
}
}#2
}
######
sub
SwissProt_by_acc{
my
($p, $transcript_ids_pointer, $uniprot_accs_hash_pointer,
$out_dir)=@_;
my
@transcript_ids=@$transcript_ids_pointer;
my
%uniprot_accs_hash=%$uniprot_accs_hash_pointer;
my
$n=0;
#
my
@feature_tags=('KEGG','GO','InterPro');
my$sp_obj
= Bio::DB::SwissProt->new();
foreach
my $transcript_id(@transcript_ids){#2
my
%pro_info;
my
$uniprot_acc_str=$uniprot_accs_hash{$transcript_id};
my
@accs=split(',', $uniprot_acc_str);
foreach
my $acc(@accs){#3
eval{
my
$seq_obj;
if
($seq_obj=$sp_obj->get_Seq_by_acc($acc) ){
$pro_info{pro_len}=$seq_obj->length;
#get
annotations
my
$anno_collection=$seq_obj->annotation;
foreach
my $key ( $anno_collection->get_all_annotation_keys ) { #4
features circyling
#print
"$key\n";
my
@annotations=$anno_collection->get_Annotations($key);
#print
"@annotations\n";
for
my $value(@annotations){#5
#printf("\t%s###%s\n",
$value->tagname, $value->display_text);
if
($value->tagname eq 'dblink'){#6
my@a=split(":",
$value->display_text);
my
$f_tag=shift @a;
my
$f_value=join(':', @a);
#printf("\t%s###%s\t%s\n",
$value->tagname, $f_tag, $f_value);
$pro_info{$f_tag}=
$pro_info{$f_tag} ? $pro_info{$f_tag}.','.$f_value : $f_value;
}#6
}#5
}#4
}
};
printf("No
%s found\n", $acc) if $@; #return error when query failure
}#3
#export
foreach
my $tag_name(@feature_tags){#3
my
$tag_value='NA';
if
($pro_info{$tag_name}){
my
@values=split(',', $pro_info{$tag_name});
@values=List::MoreUtils::uniq
@values;
$tag_value=join(',',
@values);
}
sub_basic::refresh_log($out_dir.$transcript_id,
$tag_name, $tag_value);
printf("%d:%s\t%s\n",
$p+$n, $transcript_id, $tag_value);
}#3
$n++;
}#2
}
##########
sub
EntrezGene_by_acc{
my
($p, $transcript_ids_pointer, $out_dir)=@_;
my
@transcript_ids=@$transcript_ids_pointer;
my
$n=0;
#
my$query_obj
= Bio::DB::EntrezGene->new();
foreach
my $transcript_id(@transcript_ids){#2
eval{
my
$seq_obj;
if
($seq_obj=$query_obj->get_Seq_by_acc($transcript_id) ){
my
%feature;
#get
annotations
my
$anno_collection=$seq_obj->annotation;
foreach
my $key ( $anno_collection->get_all_annotation_keys ) { #4
features circyling
#print
"$key\n";
my
@annotations=$anno_collection->get_Annotations($key);
#print
"@annotations\n";
for
my $value(@annotations){#5
#printf("\t%s###%s\n",
$value->tagname, $value->display_text);
if
($value->tagname eq 'Function'){#6
my
$GO_term='GO:'.$value->display_text;
my
$GO_cat='GO_MF';
$feature{$GO_cat}=
$feature{$GO_cat} ? $feature{$GO_cat}.','.$GO_term : $GO_term;
}#6
elsif
($value->tagname eq 'Process'){#6
my
$GO_term='GO:'.$value->display_text;
my
$GO_cat='GO_BP';
$feature{$GO_cat}=
$feature{$GO_cat} ? $feature{$GO_cat}.','.$GO_term : $GO_term;
}#6
elsif
($value->tagname eq 'Component'){#6
my
$GO_term='GO:'.$value->display_text;
my
$GO_cat='GO_CC';
$feature{$GO_cat}=
$feature{$GO_cat} ? $feature{$GO_cat}.','.$GO_term : $GO_term;
}#6
elsif
(List::Util::first {$value->tagname eq $_} ('MIM', 'HPRD', 'HGNC',
'Exon count', 'Ensembl') ){#6
$feature{$value->tagname}=$value->display_text;
}#6
}#5
}#4
$feature{'GO_Entrez'}
= $feature{'GO_Entrez'} ? $feature{'GO_Entrez'}.','.
$feature{'GO_BP'} : $feature{'GO_BP'} if $feature{'GO_BP'};
$feature{'GO_Entrez'}
= $feature{'GO_Entrez'} ? $feature{'GO_Entrez'}.','.
$feature{'GO_CC'} : $feature{'GO_CC'} if $feature{'GO_CC'};
$feature{'GO_Entrez'}
= $feature{'GO_Entrez'} ? $feature{'GO_Entrez'}.','.
$feature{'GO_MF'} : $feature{'GO_MF'} if $feature{'GO_MF'};
#export
$feature{'EntrezGene'}='yes';
printf("%d:%s\n",
$p+$n, $transcript_id);
foreach
my $k(keys %feature){
sub_basic::refresh_log($out_dir.$transcript_id,
$k, $feature{$k});
printf("\t%s:
%s\n", $k, $feature{$k});
}
$n++;
}
};
printf
("No %s found.\n", $transcript_id) if $@; #return error
when query;
}#2
}
No comments:
Post a Comment