Monday, August 17, 2015

Bioinformatics in Python: CpG islands

Abstract: hunt CpG islands in a genome sequence

CpG islands is defined as a CpG-sites riched sequence where the obs/exp values is >0.65, and GC content is >55. Here, the window size is usually 500bp. Many CpG islands in mammals are often around the start of genes or promoters.

The results:
1 dna:chromosome chromosome:GRCh38:1:1:248956422:1 REF
length of DNA sequence: 248956422
Number of CpG sites: 2375159 [10470, 10483, 10488, 10492, 10496, 10524, 10541, 10562, 10570]
hunt CpG islands:
Number of CpG islands:  46282
ok

The script:
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 16 20:57:50 2015

@author: yuan
"""
import itertools
import re

class methylations:
   
    def __init__(self, in_file):
        self.in_file=in_file
   
    def read_fasta(self):
        ref={}
        acc=''
        seq=[]
        in_obj=open(self.in_file, 'r')
        for line in in_obj:
            line=line.rstrip('\n')
            #first line
            if line.find(">")==0:
                #new sequence
                if line.find('|')>0:
                    items=line.split('|')
                    acc=items[1]
                else:
                    acc=re.sub("^>", "", line)
                ref[acc]=[]
                print acc
            else:
                #seq line
                ref[acc].append(line)
        in_obj.close()
        #combine strings
        for key in ref.keys():
            ref[key]=''.join(ref[key])
        return ref
       
    def GC_content(self, DNA_seq, digits=2):
        base_len=float(len(DNA_seq))
        G_num=DNA_seq.count('G')
        C_num=DNA_seq.count('C')
       
        #GC content
        GC_ratio=(G_num+C_num)*100/base_len
        GC_content=round(GC_ratio, digits)
        return GC_content

    def CpG_value(self, DNA_seq, digits=2):
        base_len=float(len(DNA_seq))
        CG_num=DNA_seq.count('CG')
        #observed numbers of CpG sites
        obs_value=CG_num*base_len
        G_num=DNA_seq.count('G')
        C_num=DNA_seq.count('C')
        #expected numbers of CpG sites
        exp_value=G_num*C_num
        CpG_value=0 if exp_value==0 else round(obs_value*100/exp_value, digits)
        #print CpG_value
        return CpG_value
       
    def hunt_CpG_sites(self, DNA_seq):
        print 'length of DNA sequence:', len(DNA_seq)
        CpG_sites=[]
        if DNA_seq.find("CG")>0:
            CpG_sites=[m.start() for m in re.finditer('CG', DNA_seq)]
        print 'Number of CpG sites:', len(CpG_sites),CpG_sites[1:10]
        return CpG_sites
           
       
    def CpG_islands(self, window_size=500, GC_cutoff=55, CpG_cutoff=65):
        #get sequence with a given chromosome
        ref=self.read_fasta()
        DNA_seq=ref.values()[0]
        #print len(DNA_seq)
       
        #hunt genomic positions of CpG sites
        CpG_sites=self.hunt_CpG_sites(DNA_seq)
       
        #
        print 'hunt CpG islands:'
        start=0
        end=1
        n=0
        CpG_islands=[]
        while end < len(CpG_sites):
            win_seq_len=CpG_sites[end]-CpG_sites[start]
            if win_seq_len < window_size:
                end += 1
            else:
                win_seq=DNA_seq[start:end]
                GC_content=self.GC_content(win_seq)
                if GC_content<GC_cutoff:
                    start += 1
                else:
                    CpG_value=self.CpG_value(win_seq)               
                    #print CpG_value
                    if CpG_value < CpG_cutoff:
                        start +=1
                    else:
                        CpG_info=(start+1, end+1, win_seq_len, GC_content, CpG_value)
                        CpG_islands.append(CpG_info)
                        #print CpG_info
                        start=end+1
                        end=start+1
                        n +=1
        print 'Number of CpG islands: ', n
        return CpG_islands

if __name__=="__main__":
    #
    in_file='/home/yuan/mysql_pre/cookbooks/Homo_sapiens.GRCh38.dna.chromosome.1.fa'
    m=methylations(in_file)
    m.CpG_islands()
    print 'ok'

No comments:

Post a Comment