Abstract:
Convert GBK format file into FASTA format file based on the annotated
features in GBK files.
GBK format file
GenBank is the NIH
genetic sequence database for annotated collection of all publicly
available DNA sequences (http://www.ncbi.nlm.nih.gov/genbank/).
Those annotated features can be stored into the GenBank flat file
format (GBK file format). Here is the GenBank record from human RNA
annotations in GBK file.
LOCUS NR_046018
1652 bp RNA linear PRI 11-JAN-2014
DEFINITION Homo sapiens
DEAD/H (Asp-Glu-Ala-Asp/His) box helicase 11 like 1
(DDX11L1), non-coding RNA.
ACCESSION NR_046018
XM_003403543
VERSION NR_046018.2 GI:389886562
KEYWORDS RefSeq.
SOURCE Homo sapiens (human)
ORGANISM Homo sapiens
Eukaryota; Metazoa; Chordata; Craniata; Vertebrata;
Euteleostomi;
Mammalia; Eutheria; Euarchontoglires; Primates;
Haplorrhini;
Catarrhini; Hominidae; Homo.
REFERENCE 1 (bases 1 to 1652)
AUTHORS Costa,V., Casamassimi,A., Roberto,R., Gianfrancesco,F.,
Matarazzo,M.R., D'Urso,M., D'Esposito,M., Rocchi,M. and
Ciccodicola,A.
TITLE DDX11L: a novel transcript family emerging from human
subtelomeric
regions
JOURNAL BMC Genomics 10, 250 (2009)
PUBMED 19476624
REMARK Publication Status: Online-Only
COMMENT VALIDATED REFSEQ: This record has undergone validation
or
preliminary review. The reference sequence was derived
from
AM992871.1.
On Jun 5, 2012 this sequence version replaced
gi:372622384.
##Evidence-Data-START##
Transcript exon combination :: AM992871.1, BM920887.1
[ECO:0000332]
RNAseq introns :: single sample supports all
introns
ERS025091, ERS025094
[ECO:0000348]
##Evidence-Data-END##
PRIMARY REFSEQ_SPAN PRIMARY_IDENTIFIER PRIMARY_SPAN
COMP
1-1652 AM992871.1 1-1652
FEATURES
Location/Qualifiers
source
1..1652
/organism="Homo sapiens"
/mol_type="transcribed RNA"
/db_xref="taxon:9606"
/chromosome="1"
/map="1p36.33"
gene
1..1652
/gene="DDX11L1"
/note="DEAD/H (Asp-Glu-Ala-Asp/His) box
helicase 11 like
1"
/pseudo
/db_xref="GeneID:100287102"
/db_xref="HGNC:37102"
misc_RNA
1..1652
/gene="DDX11L1"
/product="DEAD/H (Asp-Glu-Ala-Asp/His) box
helicase 11
like 1"
/pseudo
/db_xref="GeneID:100287102"
/db_xref="HGNC:37102"
ORIGIN
1 cttgccgtca gccttttctt tgacctcttc tttctgttca tgtgtatttg
ctgtctctta
61 gcccagactt cccgtgtcct ttccaccggg cctttgagag gtcacagggt
cttgatgctg
121 tggtcttcat ctgcaggtgt ctgacttcca gcaactgctg gcctgtgcca
gggtgcaagc
181 tgagcactgg agtggagttt tcctgtggag aggagccatg cctagagtgg
gatgggccat
241 tgttcatctt ctggcccctg ttgtctgcat gtaacttaat accacaacca
ggcatagggg
301 aaagattgga ggaaagatga gtgagagcat caacttctct cacaacctag
gccagtgtgt
361 ggtgatgcca ggcatgccct tccccagcat caggtctcca gagctgcaga
agacgacggc
421 cgacttggat cacactcttg tgagtgtccc cagtgttgca gaggcagggc
catcaggcac
481 caaagggatt ctgccagcat agtgctcctg gaccagtgat acacccggca
ccctgtcctg
541 gacacgctgt tggcctggat ctgagccctg gtggaggtca aagccacctt
tggttctgcc
601 attgctgctg tgtggaagtt cactcctgcc ttttcctttc cctagagcct
ccaccacccc
661 gagatcacat ttctcactgc cttttgtctg cccagtttca ccagaagtag
gcctcttcct
721 gacaggcagc tgcaccactg cctggcgctg tgcccttcct ttgctctgcc
cgctggagac
781 ggtgtttgtc atgggcctgg tctgcaggga tcctgctaca aaggtgaaac
ccaggagagt
841 gtggagtcca gagtgttgcc aggacccagg cacaggcatt agtgcccgtt
ggagaaaaca
901 ggggaatccc gaagaaatgg tgggtcctgg ccatccgtga gatcttccca
gggcagctcc
961 cctctgtgga atccaatctg tcttccatcc tgcgtggccg agggccaggc
ttctcactgg
1021 gcctctgcag gaggctgcca tttgtcctgc ccaccttctt agaagcgaga
cggagcagac
1081 ccatctgcta ctgccctttc tataataact aaagttagct gccctggact
attcaccccc
1141 tagtctcaat ttaagaagat ccccatggcc acagggcccc tgcctggggg
cttgtcacct
1201 cccccacctt cttcctgagt cattcctgca gccttgctcc ctaacctgcc
ccacagcctt
1261 gcctggattt ctatctccct ggcttggtgc cagttcctcc aagtcgatgg
cacctccctc
1321 cctctcaacc acttgagcaa actccaagac atcttctacc ccaacaccag
caattgtgcc
1381 aagggccatt aggctctcag catgactatt tttagagacc ccgtgtctgt
cactgaaacc
1441 ttttttgtgg gagactattc ctcccatctg caacagctgc ccctgctgac
tgcccttctc
1501 tcctccctct catcccagag aaacaggtca gctgggagct tctgccccca
ctgcctaggg
1561 accaacaggg gcaggaggca gtcactgacc ccgagacgtt tgcatcctgc
acagctagag
1621 atcctttatt aaaagcacac tgttggtttc tg
//
The definitions of
the GBK format are present at
http://www.ncbi.nlm.nih.gov/genbank/samplerecord/.
There are some key points we should know:
1. How to index the
annotated sequences? The tags known as LOCUS or ACCESSION can be used
for index of the sequence.
2. What is the
features?. The features are listed under the tag FEATURE. There is
always a tag known as SOURCE consistent with the whole sequence by
this record. The other tags differ from the DNA/RNA/Protein records.
In other words, those features are the sub-units of the top
annotation of this GBK record.
3. Where is the
sequences? The sequences is under the tag known as ORIGIN.
Here is the Perl
script for converting the sequences of all the features into FASTA
format files. This script would generate multiple files named as the
names of features in a given directory.
#!/usr/bin/perl
use strict;
use warnings;
##########################
#extract sequences from GBK file
sub rna_gbk{#1
my ($variables_pointer)=@_;
my %variables=%$variables_pointer;
$variables{result_dir}=$variables{result_dir}.'RNA/';
mkdir($variables{result_dir}, 0755) unless -d
$variables{result_dir};
my %rna_feature;
open my ($LOG), ">",
$variables{result_dir}.$variables{specie}.'_RNA.log' or die;
print $LOG "\nExtract RNA information from
$variables{genome_dir}.$variables{rna_gbk}:\n\n";
open my ($RNA_txt), ">",
$variables{result_dir}.$variables{specie}.'_RNA_info.txt' or die;
my $in = Bio::SeqIO->new(-file => $variables{rna_gbk},
-format => 'genbank');
while ( my $seq = $in->next_seq() ) { #2 sequence circyling
my %rna_info=(display_id=>"NA", accession=>"NA",
description=>"NA", rna_seq =>"NA",
rna_len =>0, chromosome =>"NA",
rna_map =>"NA", mol_type =>"NA", );
$rna_info{display_id}=$seq->display_id();
$rna_info{GI}=$seq->primary_id();
$rna_info{accession}=$seq->accession();
$rna_info{description}=$seq->desc();
$rna_info{rna_seq}=$seq->seq();
$rna_info{rna_len}=$seq->length();
#print "$rna_info{accession}\n";
foreach my $feat ( $seq->get_SeqFeatures() ) { #3 features
circyling
my %feature;
my $tag_name=$feat->primary_tag();
$feature{name}=$tag_name;
$feature{start}=$feat->start;
$feature{end}=$feat->end;
$feature{strand}=$feat->strand;
$feature{seq}=substr($rna_info{rna_seq}, $feature{start}-1,
$feature{end}-$feature{start}+1);
$feature{gene}='NA';
#get feature's info
foreach my $tag ( $feat->get_all_tags() ) { #4
my $tag_value=join("",
$feat->get_tag_values($tag));
if($tag=~/db_xref/){
my($a, $b)=split(":", $tag_value);
$feature{$a}=$b;
}
else{
$feature{$tag}=$tag_value;
}
}#4
#get pseudo-genes
if ($tag_name=~/misc_RNA/ and exists $feature{pseudo}){
$feature{name}='pseudo';
}
#extract source_features
if ( $tag_name=~/source/ ) { #4
$rna_info{chromosome}=$feature{chromosome} if exists
$feature{chromosome};
$rna_info{rna_map}=$feature{map} if exists $feature{map};
$rna_info{mol_type}=$feature{mol_type} if exists
$feature{mol_type};
#rna sequences information and source features
print $RNA_txt join("\t", $rna_info{accession},
$rna_info{display_id}, $rna_info{description},
$rna_info{rna_len},
$rna_info{chromosome}, $rna_info{rna_map},
$rna_info{mol_type},
$rna_info{rna_seq}), "\n";
#change feature name
$feature{mol_type}=~s/\s/_/g;
$feature{name}=$feature{mol_type};
}#4
elsif ($tag_name=~/ncRNA/) {#4
$feature{name}=$feature{ncRNA_class};
}#4
my
$feature_index=$rna_info{accession}.':'.$rna_info{rna_len}.':'.$feature{gene}.':'.$feature{start}.'_'.$feature{end};
$rna_feature{$feature{name}}->{$feature_index}=$feature{seq};
if ($tag_name=~/CDS/){#4
#5'UTR
if($feature{start}>10){
my $UTR5_start=1;
my $UTR5_end=$feature{start}-1;
my $UTR5_seq=substr($rna_info{rna_seq}, $UTR5_start-1,
$UTR5_end-$UTR5_start+1);
my
$feature_index=$rna_info{accession}.':'.$rna_info{rna_len}.':'.$feature{gene}.':'.$UTR5_start.'_'.$UTR5_end;
$rna_feature{'5UTR'}->{$feature_index}=$UTR5_seq;
}
#3'UTR
if ($rna_info{rna_len}-$feature{end}>10){
my $UTR3_start=$feature{end}+1;
my $UTR3_end=$rna_info{rna_len};
my $UTR3_seq=substr($rna_info{rna_seq}, $UTR3_start-1,
$UTR3_end-$UTR3_start+1);
my
$feature_index=$rna_info{accession}.':'.$rna_info{rna_len}.':'.$feature{gene}.':'.$UTR3_start.'_'.$UTR3_end;
$rna_feature{'3UTR'}->{$feature_index}=$UTR3_seq;
}
}#4
}#3 features circyling
} #2 sequence circyling
print "export sequences of features\n";
foreach my $feature_name(keys %rna_feature){
print "$feature_name\n";
my $pointer=$rna_feature{$feature_name};
my %hash=%$pointer;
open my $OUT, ">",
$variables{result_dir}.$variables{specie}.'_RNA_'.$feature_name.'.fa'
or die;
foreach my $feature_index(keys %hash){
print $OUT ">$feature_name:$feature_index\n",
"$hash{$feature_index}\n";
}
close($OUT);
my $num=keys %hash;
print $LOG "$feature_name=$num\n";
}
close($LOG);
}#1
######################################################
#main program
#H. sapiens genome data extraction
my %variables=(
'genome_dir'=>"/home/yuan/data_preparation/ftp.ncbi.nlm.nih.gov/genomes/H_sapiens/",
'result_dir'=>"/home/yuan/mysql_pre/human/",
'specie'=>"human",
'rna_gbk'=>"/home/yuan/data_preparation/ftp.ncbi.nlm.nih.gov/genomes/H_sapiens/RNA/rna.gbk",
);
# extract rna information from rna.gbk
rna_gbk(\%variables);
print "OK\n";
Writing date:
2012.12.01, 2015.02.16
No comments:
Post a Comment