Abstract: hunt CpG islands in a genome sequence
CpG islands is defined as a CpG-sites riched sequence where the obs/exp values is >0.65, and GC content is >55. Here, the window size is usually 500bp. Many CpG islands in mammals are often around the start of genes or promoters.
The results:
1 dna:chromosome chromosome:GRCh38:1:1:248956422:1 REF
length of DNA sequence: 248956422
Number of CpG sites: 2375159 [10470, 10483, 10488, 10492, 10496, 10524, 10541, 10562, 10570]
hunt CpG islands:
Number of CpG islands: 46282
ok
The script:
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 16 20:57:50 2015
@author: yuan
"""
import itertools
import re
class methylations:
def __init__(self, in_file):
self.in_file=in_file
def read_fasta(self):
ref={}
acc=''
seq=[]
in_obj=open(self.in_file, 'r')
for line in in_obj:
line=line.rstrip('\n')
#first line
if line.find(">")==0:
#new sequence
if line.find('|')>0:
items=line.split('|')
acc=items[1]
else:
acc=re.sub("^>", "", line)
ref[acc]=[]
print acc
else:
#seq line
ref[acc].append(line)
in_obj.close()
#combine strings
for key in ref.keys():
ref[key]=''.join(ref[key])
return ref
def GC_content(self, DNA_seq, digits=2):
base_len=float(len(DNA_seq))
G_num=DNA_seq.count('G')
C_num=DNA_seq.count('C')
#GC content
GC_ratio=(G_num+C_num)*100/base_len
GC_content=round(GC_ratio, digits)
return GC_content
def CpG_value(self, DNA_seq, digits=2):
base_len=float(len(DNA_seq))
CG_num=DNA_seq.count('CG')
#observed numbers of CpG sites
obs_value=CG_num*base_len
G_num=DNA_seq.count('G')
C_num=DNA_seq.count('C')
#expected numbers of CpG sites
exp_value=G_num*C_num
CpG_value=0 if exp_value==0 else round(obs_value*100/exp_value, digits)
#print CpG_value
return CpG_value
def hunt_CpG_sites(self, DNA_seq):
print 'length of DNA sequence:', len(DNA_seq)
CpG_sites=[]
if DNA_seq.find("CG")>0:
CpG_sites=[m.start() for m in re.finditer('CG', DNA_seq)]
print 'Number of CpG sites:', len(CpG_sites),CpG_sites[1:10]
return CpG_sites
def CpG_islands(self, window_size=500, GC_cutoff=55, CpG_cutoff=65):
#get sequence with a given chromosome
ref=self.read_fasta()
DNA_seq=ref.values()[0]
#print len(DNA_seq)
#hunt genomic positions of CpG sites
CpG_sites=self.hunt_CpG_sites(DNA_seq)
#
print 'hunt CpG islands:'
start=0
end=1
n=0
CpG_islands=[]
while end < len(CpG_sites):
win_seq_len=CpG_sites[end]-CpG_sites[start]
if win_seq_len < window_size:
end += 1
else:
win_seq=DNA_seq[start:end]
GC_content=self.GC_content(win_seq)
if GC_content<GC_cutoff:
start += 1
else:
CpG_value=self.CpG_value(win_seq)
#print CpG_value
if CpG_value < CpG_cutoff:
start +=1
else:
CpG_info=(start+1, end+1, win_seq_len, GC_content, CpG_value)
CpG_islands.append(CpG_info)
#print CpG_info
start=end+1
end=start+1
n +=1
print 'Number of CpG islands: ', n
return CpG_islands
if __name__=="__main__":
#
in_file='/home/yuan/mysql_pre/cookbooks/Homo_sapiens.GRCh38.dna.chromosome.1.fa'
m=methylations(in_file)
m.CpG_islands()
print 'ok'
No comments:
Post a Comment