Saturday, August 1, 2015
bioinformatics in python: enzyme sites of genome sequences
Abstract: get all positions of a given enzyme sites
The enzyme sites can stored into list or bitmap.
chr list bitmap
1 584661 7779497
10 330135 4180795
11 330070 4221143
12 332355 4164525
13 225415 3573508
14 221532 3340098
15 212185 3186869
16 217813 2819636
17 225530 2601465
18 180484 2508186
19 174601 1831434
2 579526 7568172
20 160422 2010443
21 91397 1459306
22 102349 1587692
3 473048 6194839
4 440011 5943835
5 431296 5671196
6 409943 5335812
7 390396 4979244
8 349824 4533704
9 297447 4322947
MT 23 488
X 377705 4875903
Y 52949 1777741
Memory usage of dictionary and bitmap (byte): 419
Memory usage of dictionary and list combined (byte): 419
ok
The script:
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 31 10:58:36 2015
@author: yuan
"""
import math
import sys
import re
import itertools
###############
#dictionary and bitmap combined
#bitmap used for storing all enzyme sites of each chromosome
#
class bitmap:
def __init__(self, arr, bitmap_len=1, bitmap=False):
self.num=len(arr)
self.arr=arr
self.bitmap_len=bitmap_len
if self.num==0:
self.bitmap=[0]
else:
if bitmap is False:
self.list_to_bitmap()
else:
self.bitmap=self.arr
self.bitmap_len=len(self.bitmap)
def list_to_bitmap(self):
#list to bitmap
arr_bitmap_len=int(math.ceil((max(self.arr)+1)/32.))
if arr_bitmap_len >= self.bitmap_len:
self.bitmap_len = arr_bitmap_len
self.bitmap=[0]*self.bitmap_len
for v in self.arr:
divide, remainder = divmod(v, 32)
if self.test_element(v)==0:
self.bitmap[divide]=(2**remainder)+self.bitmap[divide]
else:
print v, 'is duplicated!'
def test_element(self, number):
pool_remainder=0
divide, remainder = divmod(number, 32)
if divide<self.bitmap_len and number>=0 and self.bitmap[divide]>0:
#bitstring in bitmap
bit_str=format(self.bitmap[divide],'b')
str_len=len(bit_str)
if str_len > remainder:
pool_remainder=int(bit_str[(str_len-remainder-1):(str_len-remainder)])
return pool_remainder
def to_bitmap(self):
return self.bitmap
def min_value(self):
min_value=0
for i in range(self.bitmap_len):
if self.bitmap[i]>0:
bit_str=format(self.bitmap[i], 'b')
#print i,bit_str
min_value=i*32+(len(bit_str)-bit_str.rfind('1')-1)
break
return min_value
def max_value(self):
max_value=0
for i in range(self.bitmap_len)[::-1]:
if self.bitmap[i]>0:
bit_str=format(self.bitmap[i], 'b')
#print bit_str
#
max_value=i*32+len(bit_str)-1
break
return max_value
def to_list(self, bitmap_index):
mylist=[]
if bitmap_index<self.bitmap_len:
bitmap_value=self.bitmap[bitmap_index]
if bitmap_value>0:
bit_str=format(bitmap_value, 'b')[::-1]
#print bit_str
for index, element in enumerate(bit_str):
if int(element)==1:
a=bitmap_index*32+index
mylist.append(a)
#print '=', bitmap_index,bitmap_value, bit_str, mylist
return mylist
def range_bitmap(self, start_value=None, end_value=None):
if start_value is None:
start_value=self.min_value()
if end_value<start_value or end_value is None:
end_value=self.max_value()
#
mylist=[]
divide1, remainder1=divmod(start_value, 32)
divide2, remainder2=divmod(end_value, 32)
#loop
for i in range(divide1, (divide2+1) ):
part_list=self.to_list(i)
if len(part_list)>0:
if divide1 < i < divide2:
mylist.extend(part_list)
else:
for value in part_list:
if start_value<= value <=end_value:
mylist.append(value)
return mylist
def add_element(self, number):
divide, remainder=divmod(number, 32)
if divide>=len(self.bitmap):
bitmap_add=[0]*(divide-len(self.bitmap)+1)
#print bitmap_add
self.bitmap.extend(bitmap_add)
self.bitmap_len=len(self.bitmap)
#add a number
self.bitmap[divide] +=(2**remainder)
return self.bitmap
def remove_element(self, number):
divide, remainder=divmod(number, 32)
if self.test_element(number):
self.bitmap[divide]=self.bitmap[divide]-(2**remainder)
return self.bitmap
##################
class genome_info(bitmap):
def __init__(self, fa_file):
self.fa_file=fa_file
#Func is reference of functions
def read_fasta(self, Func=None):
#store accession: sequences
ref={}
with open(self.fa_file, 'r') as in_obj:
for line1,line2 in itertools.izip_longest(*[in_obj]*2):
#first line
line1=line1.rstrip('\n')
#accession number
acc=0
if line1.find('|')>0:
items=line1.split('|')
acc=items[1]
else:
acc=re.sub("^>", "", line1)
print acc
#second line: sequence
line2=line2.rstrip('\n')
ref[acc]=line2 if Func is None else Func(line2)
in_obj.close()
return ref
def motif_bitmap(self, seq):
#read enzyme sites into list
motif_list=[m.start() for m in re.finditer(self.enzyme, seq)]
#transfer list to bitmap
motif_bitmap=bitmap(motif_list).to_bitmap()
#print motif_bitmap
#print seq
return motif_bitmap
def enzyme_sites(self, enzyme):
self.enzyme=enzyme.upper()
#
genome_enzyme_sites=self.read_fasta(self.motif_bitmap)
#
ES_dict={}
for key in sorted(genome_enzyme_sites.keys() ):
ES_bitmap=genome_enzyme_sites[key]
b=bitmap(ES_bitmap, bitmap=True)
ES_dict[key]=b.range_bitmap()
print key, len(ES_dict[key]), len(ES_bitmap)
#print ES_bitmap
return genome_enzyme_sites, ES_dict
if __name__=="__main__":
fa_file='/home/yuan/eRNA/ref_seq/human_Ensembl_GRCh38R80.fa'
#fa_file='/home/yuan/mysql_pre/cookbooks/test.fa'
#my code
g=genome_info(fa_file)
ES, ES_dict=g.enzyme_sites('GATC') #DpnI
print 'Memory usage of dictionary and bitmap (byte):', sys.getsizeof(ES)/8
print 'Memory usage of dictionary and list combined (byte):', sys.getsizeof(ES_dict)/8
print 'ok'
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment