Saturday, August 8, 2015

Bioinformatics in Python: retrieve sequence from NCBI

Abstract: retrieve genome sequence from the ftp site of NCBI, and save them into a fasta file in order

The result:
get GIs of the newest version
Version: GRCh38p2
Save genome sequences into /home/yuan/mysql_pre/cookbooks/H_sapiens_GRCh38p2.fa
retrieve sequence:
chr: 1, GI: 568815597
chr: 2, GI: 568815596
chr: 3, GI: 568815595
chr: 4, GI: 568815594
chr: 5, GI: 568815593
chr: 6, GI: 568815592
chr: 7, GI: 568815591
chr: 8, GI: 568815590
chr: 9, GI: 568815589
chr: 10, GI: 568815588
chr: 11, GI: 568815587
chr: 12, GI: 568815586
chr: 13, GI: 568815585
chr: 14, GI: 568815584
chr: 15, GI: 568815583
chr: 16, GI: 568815582
chr: 17, GI: 568815581
chr: 18, GI: 568815580
chr: 19, GI: 568815579
chr: 20, GI: 568815578
chr: 21, GI: 568815577
chr: 22, GI: 568815576
chr: MT, GI: 251831106
chr: X, GI: 568815575
chr: Y, GI: 568815574
ok


The script:
# -*- coding: utf-8 -*-
"""
Created on Sat Aug  8 10:53:12 2015

@author: yuan
"""
import urllib2
import re
from Bio import Entrez, SeqIO


class NCBI_genome:
    def __init__(self, specie):
        self.specie=specie
        self.NCBI_ftp='ftp://ftp.ncbi.nlm.nih.gov/genomes/'
   
    def get_GI(self):
        web_file=self.NCBI_ftp+self.specie+'/Assembled_chromosomes/chr_NC_gi'
        ##retieve a web file
        urlfile=urllib2.urlopen(web_file)
        html=urlfile.read()
        lines=html.split('\n')
       
        #get index
        header=lines.pop(0)
        header=header.split('\t')
        for index, item in enumerate(header):
            #print index,item
            if item=='#Chr':
                chr_index=index
            elif item=='gi':
                gi_index=index
            elif item=='Assembly and unit':
                ver_index=index
        #loop chr
        GI_dict={}
        for line in lines:
            if 'GRCh' in line:
                items=line.split('\t')
                GI_dict[items[chr_index]]=items[gi_index]
                #print items[chr_index], items[gi_index]
        #release version
        release_version=lines[0].split('\t')[ver_index]
        release_version=release_version.split()[0]
        release_version=re.sub(r'\.', '', release_version)
        return GI_dict, release_version
       
    def sort_list(self, mix_list):
        #mixed list is numeric and string mixed together
        sort_list=[]
        str_list=[]
        for element in mix_list:
            if element.isdigit():
                sort_list.append(int(element))
            else:
                str_list.append(element)
        #
        str_list.sort()
        sort_list.sort()
        sort_list.extend(str_list)
        sort_list=map(str, sort_list)
        #print sort_list
        return sort_list

    def NCBI_seq(self, GI, email="yuan@surgery.wisc.edu"):
        Entrez.email=email
        handle=Entrez.efetch(db="nucleotide", rettype="fasta", retmode="text", id=GI)
        seq_record = SeqIO.read(handle, "fasta")
        seq=str(seq_record.seq)
        handle.close()
        return seq
       
    def genome_fasta(self, out_dir):
        print 'get GIs of the newest version'
        GI_dict, release_version=self.get_GI()
        print 'Version:',release_version
        #order of chromosome
        chr_list=self.sort_list(GI_dict.keys())
        #visit NCBI site and retrieve genome sequences
        out_file=out_dir+self.specie+'_'+release_version+'.fa'
        print 'Save genome sequences into', out_file
        print 'retrieve sequence:'
        out_obj=open(out_file, 'w')
        for chr_name in chr_list:
            GI=GI_dict[chr_name]
            print 'chr: %s, GI: %s' % (chr_name, GI)
            seq=self.NCBI_seq(GI)
            out_obj.write('>'+chr_name+'\n'+seq+'\n')
        out_obj.close()
       

if __name__=="__main__":
    specie='H_sapiens'
    n=NCBI_genome(specie)
    out_dir='/home/yuan/mysql_pre/cookbooks/'
    n.genome_fasta(out_dir)
    print 'ok'

No comments:

Post a Comment