Abstract: retrieve genome sequence from the ftp site of NCBI, and save them into a fasta file in order
The result:
get GIs of the newest version
Version: GRCh38p2
Save genome sequences into /home/yuan/mysql_pre/cookbooks/H_sapiens_GRCh38p2.fa
retrieve sequence:
chr: 1, GI: 568815597
chr: 2, GI: 568815596
chr: 3, GI: 568815595
chr: 4, GI: 568815594
chr: 5, GI: 568815593
chr: 6, GI: 568815592
chr: 7, GI: 568815591
chr: 8, GI: 568815590
chr: 9, GI: 568815589
chr: 10, GI: 568815588
chr: 11, GI: 568815587
chr: 12, GI: 568815586
chr: 13, GI: 568815585
chr: 14, GI: 568815584
chr: 15, GI: 568815583
chr: 16, GI: 568815582
chr: 17, GI: 568815581
chr: 18, GI: 568815580
chr: 19, GI: 568815579
chr: 20, GI: 568815578
chr: 21, GI: 568815577
chr: 22, GI: 568815576
chr: MT, GI: 251831106
chr: X, GI: 568815575
chr: Y, GI: 568815574
ok
The script:
# -*- coding: utf-8 -*-
"""
Created on Sat Aug 8 10:53:12 2015
@author: yuan
"""
import urllib2
import re
from Bio import Entrez, SeqIO
class NCBI_genome:
def __init__(self, specie):
self.specie=specie
self.NCBI_ftp='ftp://ftp.ncbi.nlm.nih.gov/genomes/'
def get_GI(self):
web_file=self.NCBI_ftp+self.specie+'/Assembled_chromosomes/chr_NC_gi'
##retieve a web file
urlfile=urllib2.urlopen(web_file)
html=urlfile.read()
lines=html.split('\n')
#get index
header=lines.pop(0)
header=header.split('\t')
for index, item in enumerate(header):
#print index,item
if item=='#Chr':
chr_index=index
elif item=='gi':
gi_index=index
elif item=='Assembly and unit':
ver_index=index
#loop chr
GI_dict={}
for line in lines:
if 'GRCh' in line:
items=line.split('\t')
GI_dict[items[chr_index]]=items[gi_index]
#print items[chr_index], items[gi_index]
#release version
release_version=lines[0].split('\t')[ver_index]
release_version=release_version.split()[0]
release_version=re.sub(r'\.', '', release_version)
return GI_dict, release_version
def sort_list(self, mix_list):
#mixed list is numeric and string mixed together
sort_list=[]
str_list=[]
for element in mix_list:
if element.isdigit():
sort_list.append(int(element))
else:
str_list.append(element)
#
str_list.sort()
sort_list.sort()
sort_list.extend(str_list)
sort_list=map(str, sort_list)
#print sort_list
return sort_list
def NCBI_seq(self, GI, email="yuan@surgery.wisc.edu"):
Entrez.email=email
handle=Entrez.efetch(db="nucleotide", rettype="fasta", retmode="text", id=GI)
seq_record = SeqIO.read(handle, "fasta")
seq=str(seq_record.seq)
handle.close()
return seq
def genome_fasta(self, out_dir):
print 'get GIs of the newest version'
GI_dict, release_version=self.get_GI()
print 'Version:',release_version
#order of chromosome
chr_list=self.sort_list(GI_dict.keys())
#visit NCBI site and retrieve genome sequences
out_file=out_dir+self.specie+'_'+release_version+'.fa'
print 'Save genome sequences into', out_file
print 'retrieve sequence:'
out_obj=open(out_file, 'w')
for chr_name in chr_list:
GI=GI_dict[chr_name]
print 'chr: %s, GI: %s' % (chr_name, GI)
seq=self.NCBI_seq(GI)
out_obj.write('>'+chr_name+'\n'+seq+'\n')
out_obj.close()
if __name__=="__main__":
specie='H_sapiens'
n=NCBI_genome(specie)
out_dir='/home/yuan/mysql_pre/cookbooks/'
n.genome_fasta(out_dir)
print 'ok'
No comments:
Post a Comment