Friday, August 14, 2015
Bioinformatics in Python: Download genome sequences from Ensembl
Abstract: The script is used for download the newest version genome annotations.
DNA, cDNA, ncDNA, protein, GFF3/GTF from the public database Ensembl (ftp://ftp.ensembl.org/pub/). All files will be stored into a local disk and decompressed as well.
The script:
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 13 17:54:43 2015
@author: yuan
"""
import re
import os
import urllib
import urllib2
import gzip
#download the newest genome sequences from Ensembl
#file opeartions
class file_directory:
def __init__(self, in_file):
self.in_file=in_file
def format_dir(self, directory):
if not directory[-1] == '/':
directory=directory+'/'
if not os.path.isdir(directory):
os.mkdir(directory, 0755)
return directory
def file_name(self):
#file directory
file_dir=self.in_file[:self.in_file.rindex('/')]+'/'
#file name
items=self.in_file.split("/")
file_name=items[-1]
if file_name.rfind(".")>0:
name_head=file_name[:file_name.rfind(".")]
name_tail=file_name[file_name.rfind("."):]
else:
name_head=file_name
name_tail=''
#print file_dir, file_name, name_head, name_tail
return file_dir, file_name, name_head, name_tail
def decompress_gz(self, out_dir=None):
inF = gzip.GzipFile(self.in_file, 'rb')
s = inF.read()
inF.close()
if out_dir is None:
out_file=self.file_name()[0]+self.file_name()[2]
else:
out_dir=self.format_dir(out_dir)
out_file=out_dir+self.file_name()[2]
outF = file(out_file, 'wb')
outF.write(s)
outF.close()
print 'Decompressed file:', out_file
return out_file
def compress_gz(self, out_dir=None):
inF = file(self.in_file, 'rb')
s = inF.read()
inF.close()
if out_dir is None:
out_file=self.in_file+'.gz'
else:
out_dir=self.format_dir(out_dir)
out_file=out_dir+self.file_name()[1]+'.gz'
outF = gzip.GzipFile(out_file, 'wb')
outF.write(s)
outF.close()
print 'Compressed file:', out_file
return out_file
def download_file(self, out_dir):
out_dir=self.format_dir(out_dir)
out_file=out_dir+self.file_name()[1]
#
web_obj=urllib.URLopener()
web_obj.retrieve(self.in_file, out_file)
print "Download file %s and save it as %s" % (self.in_file, out_file)
return out_file
#download reference genomes from Ensembl
class ensemble:
def __init__(self, specie, out_dir):
self.specie=specie
self.out_dir=out_dir
self.ensembl_url='ftp://ftp.ensembl.org/pub/'
self.ensembl_ver=self.newest_release(self.ensembl_url)
def get_html(self,url):
print 'Get html:', url
url_obj=urllib2.urlopen(url)
html=url_obj.read()
lines=html.split('\n')
return lines
def newest_release(self, url):
lines=self.get_html(url)
#print html
for line in lines:
if line.find('current')>0 and line.find('release')>0:
items=line.split()
last=items[-1]
ver=re.sub('release-','', last)
ver=ver[:ver.index('/')]
break
print 'Release:', ver
return ver
def print_dict(self, dictionary):
for key in dictionary.keys():
print key, dictionary[key]
def dna_files(self, lines):
chr_files={}
for line in lines:
if line.find('dna.chromosome')>0:
if not line.find('CHR_')>0:
items=line.split()
file_name=items[-1]
chromosome=re.sub(r'^.*chromosome\.|\.fa.gz', '', file_name)
chr_files[chromosome]=file_name
self.print_dict(chr_files)
return chr_files
def all_files(self, lines):
chr_files={}
for line in lines:
if line.find('.gz')>0 and line.find('abinitio')==-1:
items=line.split()
file_name=items[-1]
chr_files['all']=file_name
print file_name
return chr_files
def chr_files(self,genome_type):
self.genome_type=genome_type
if self.genome_type in ['dna', 'cdna', 'cds','pep','ncrna']:
self.file_type='fasta'
elif self.genome_type=='gff3':
self.file_type='gff3'
elif self.genome_type=='gtf':
self.file_type='gtf'
#get genome files
if self.genome_type in ['dna']:
#get html
url=self.ensembl_url+'current_'+self.file_type+'/'+self.specie+'/'+self.genome_type+'/'
lines=self.get_html(url)
#
chr_files=self.dna_files(lines)
elif self.genome_type in ['cdna', 'cds','pep', 'ncrna']:
#get html
url=self.ensembl_url+'current_'+self.file_type+'/'+self.specie+'/'+self.genome_type+'/'
lines=self.get_html(url)
#
chr_files=self.all_files(lines)
elif self.genome_type in ['gff3', 'gtf']:
#get html
url=self.ensembl_url+'current_'+self.file_type+'/'+self.specie+'/'
lines=self.get_html(url)
#get the list of files
chr_files=self.all_files(lines)
#download files
for key in chr_files.keys():
local_file=file_directory(url+chr_files[key]).download_file(self.out_dir)
#decompress file
file_directory(local_file).decompress_gz()
return chr_files
#main programming
if __name__=="__main__":
species={'human':'homo_sapiens', 'mouse':'mus_musculus','rat':'rattus_norvegicus'}
out_dir='/home/yuan/mysql_pre/cookbooks/'
e=ensemble(species['human'], out_dir)
e.chr_files('dna')
#e.chr_files('cdna')
#e.chr_files('cds')
#e.chr_files('pep')
#e.chr_files('ncrna')
#e.chr_files('gff3')
#e.chr_files('gtf')
print 'ok'
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment