Abstract:
Here I showed how to achieve annotations of genes or proteins from
GenBank, EntrezGene, and SwissProt.
1. From GenBank
The code:
sub
GenBank_query{#1
my
($accession, $outfile)=@_;
my
%info=('accession'=>$accession);
#GenBank
query;
eval{
my
$db_obj = Bio::DB::GenBank->new;
my
$seq = $db_obj->get_Seq_by_id($accession);
$info{description}=$seq->desc;
$info{seq_len}=$seq->length;
$info{seq}=$seq->seq;
$info{taxon_specie}=$seq->species->taxon->scientific_name;
$info{taxonid_specie}=$seq->species->taxon->id;
eval{ $info{taxon_genus}=
taxon_id($info{taxonid_specie}, 'genus'); };
eval{ $info{taxon_subfamily}=
taxon_id($info{taxonid_specie}, 'subfamily'); };
eval{ $info{taxon_family}=
taxon_id($info{taxonid_specie}, 'family'); };
foreach
my $feat ( $seq->get_SeqFeatures() ) { #3 features circyling
my
%feature;
my
$tag_name=$feat->primary_tag();
$feature{name}=$tag_name;
#get
feature's info
foreach
my $tag ( $feat->get_all_tags() ) { #4
my
$tag_value=join("", $feat->get_tag_values($tag));
if($tag=~/db_xref/){
my($a,
$b)=split(":", $tag_value);
$feature{$a}=$b;
}
else{ $feature{$tag}=$tag_value; }
}#4
#print
$tag_name, "\n";
#print
sub_data::print_hash(\%feature);
#extract
source_features
if
( $tag_name=~/source/ ) { #4
$info{chromosome}=$feature{chromosome}
if exists $feature{chromosome};
$info{map}=$feature{map}
if exists $feature{map};
$info{organism}=$feature{organism}
if exists $feature{organism};
#$info{taxon_id}=$feature{taxon}
if exists $feature{taxon};
$info{mol_type}=$feature{mol_type}
if exists $feature{mol_type};
}#4
elsif
( $tag_name=~/gene/ ) { #4
$info{gene}=
$feature{gene} if $feature{gene};
$info{GeneID}=
$feature{GeneID} if $feature{GeneID};
$info{gene_synonym}=
$feature{gene_synonym} if $feature{gene_synonym};
$info{note}=
$feature{note} if $feature{note};
}#4
elsif($tag_name=~/Protein/){
$info{pro_wt}=
$feature{calculated_mol_wt} if $feature{calculated_mol_wt};
$info{product}=
$feature{product} if $feature{product};
}
elsif
( $tag_name=~/CDS/ ) { #4
$info{GeneID}=
$feature{GeneID} if $feature{GeneID};
$info{GI}=
$feature{GI} if $feature{GI};
$info{product}=
$feature{product} if $feature{product};
$info{protein_id}=
$feature{protein_id} if $feature{protein_id};
$info{EC_number}=
$feature{EC_number} if $feature{EC_number};
}
}#3
};
printf
("No %s found.\n", $accession) if $@; #return error when
query;
#
#sub_data::print_hash(\%info);
sub_common::hash_to_file(\%info,
$outfile, '=') if $outfile;
return(\%info);
}
2. From
EntrezGene
The code:
sub
EntrezGene_query{
my
($transcript_id, $outfile)=@_;
my
%feature=('query'=>'no', 'accession'=>$transcript_id);
#
eval{#2
my$query_obj
= Bio::DB::EntrezGene->new();
my
$seq_obj=$query_obj->get_Seq_by_acc($transcript_id);
#general
annotations
$feature{desc}=$seq_obj->desc()
if $seq_obj->desc();
$feature{display_id}=$seq_obj->display_id()
if $seq_obj->display_id();
#$feature{acc}=$seq->accession_number;
$feature{display_name}=$seq->display_name
if $seq->display_name;
$feature{seq_len}=$seq->length
if $seq->length;
#get
annotations
my
$anno_collection=$seq_obj->annotation;
foreach
my $key ( $anno_collection->get_all_annotation_keys ) { #4
features circyling
#print
"$key\n";
my
@annotations=$anno_collection->get_Annotations($key);
#print
"@annotations\n";
for
my $value(@annotations){#5
my
$tagname=$value->tagname;
my
$tagtext=$value->display_text;
#printf("\t%s###%s\n",
$tagname, $tagtext);
if
($tagname eq 'Function'){#6
my
$GO_term='GO:'.$tagtext;
my
$GO_cat='GO_MF';
$feature{$GO_cat}=
$feature{$GO_cat} ? $feature{$GO_cat}.','.$GO_term : $GO_term;
}#6
elsif
($tagname eq 'Process'){#6
my
$GO_term='GO:'.$tagtext;
my
$GO_cat='GO_BP';
$feature{$GO_cat}=
$feature{$GO_cat} ? $feature{$GO_cat}.','.$GO_term : $GO_term;
}#6
elsif
($tagname eq 'Component'){#6
my
$GO_term='GO:'.$tagtext;
my
$GO_cat='GO_CC';
$feature{$GO_cat}=
$feature{$GO_cat} ? $feature{$GO_cat}.','.$GO_term : $GO_term;
}#6
elsif
(List::Util::first {$tagname eq $_} ('MIM', 'HPRD', 'HGNC', 'Exon
count', 'Ensembl', 'cyto', 'Official Symbol', 'UniGene', 'Exon
count') ){#6
$feature{$tagname}=$tagtext;
}#6
}#5
}#4
$feature{'GO_Entrez'}
= $feature{'GO_Entrez'} ? $feature{'GO_Entrez'}.','.
$feature{'GO_BP'} : $feature{'GO_BP'} if $feature{'GO_BP'};
$feature{'GO_Entrez'}
= $feature{'GO_Entrez'} ? $feature{'GO_Entrez'}.','.
$feature{'GO_CC'} : $feature{'GO_CC'} if $feature{'GO_CC'};
$feature{'GO_Entrez'}
= $feature{'GO_Entrez'} ? $feature{'GO_Entrez'}.','.
$feature{'GO_MF'} : $feature{'GO_MF'} if $feature{'GO_MF'};
$feature{'query'}='yes';
printf("Get
annotations of %s from EntrezGene\n", $transcript_id);
};#2
printf
("No %s found.\n", $transcript_id) if $@; #return error
when query;
#
#sub_data::print_hash(\%feature);
sub_common::hash_to_file(\%feature,
$outfile, '=') if $outfile;
return(\%feature);
}
3.
SwissProt/UniProt
The code:
sub
SwissProt_query{
my
($acc, $outfile)=@_;
my
%pro_info=('uniprot_acc'=>$acc);
#
eval{#3
my$sp_obj
= Bio::DB::SwissProt->new();
my
$seq_obj = $sp_obj->get_Seq_by_acc($acc);
$pro_info{uniprot_acc}=$seq_obj->accession_number;
$pro_info{display_id}=$seq_obj->display_id;
$pro_info{display_name}=$seq_obj->display_name;
$pro_info{pro_len}=$seq_obj->length;
$pro_info{description}=$seq_obj->desc;
#taxon
name
$pro_info{taxon_specie}=$seq_obj->species->species();
$pro_info{taxon_specie_taxonid}=$seq_obj->species->taxon->id;
$pro_info{taxon_scientific_name}=$seq_obj->species->taxon->scientific_name;
$pro_info{taxon_genus}=$seq_obj->species->genus();
eval{ $pro_info{taxon_genus}=
taxon_id($pro_info{taxon_specie_taxonid}, 'genus'); };
printf("No
taxonomic genus name of %s: %s from taxonomic tree\n",
$pro_info{taxon_specie_taxonid}, $pro_info{uniprot_acc}) if $@;
eval{ $pro_info{taxon_subfamily}=
taxon_id($pro_info{taxon_specie_taxonid}, 'subfamily'); };
printf("No
taxonomic subfamily name of %s: %s from taxonomic tree\n",
$pro_info{taxon_specie_taxonid}, $pro_info{uniprot_acc}) if $@;
eval{ $pro_info{taxon_family}=
taxon_id($pro_info{taxon_specie_taxonid}, 'family'); };
printf("No
taxonomic family name of %s: %s from taxonomic tree\n",
$pro_info{taxon_specie_taxonid}, $pro_info{uniprot_acc}) if $@;
#get
annotations
my
$anno_collection=$seq_obj->annotation;
foreach
my $key ( $anno_collection->get_all_annotation_keys ) { #4
features circyling
#print
"$key\n";
my
@annotations=$anno_collection->get_Annotations($key);
#print
"@annotations\n";
for
my $value(@annotations){#5
#printf("\t%s###%s\n",
$value->tagname, $value->display_text);
if
($value->tagname eq 'dblink'){#6
my@a=split(":",
$value->display_text);
my
$f_tag=shift @a;
my
$f_value=join(':', @a);
#printf("\t%s###%s\t%s\n",
$value->tagname, $f_tag, $f_value);
$pro_info{$f_tag}=
$pro_info{$f_tag} ? $pro_info{$f_tag}.','.$f_value : $f_value;
}#6
}#5
}#4
};#3
printf("No
annotations of %s found.\n",$acc) if $@; #return error when
query
#export
#sub_data::print_hash(\%pro_info);
#print
sub_common::hash_to_file(\%pro_info,
$outfile, '=') if $outfile;
return(\%pro_info);
}
#end
No comments:
Post a Comment