Wednesday, August 5, 2015

Bioinformatics in python: splice DNA

Abstract: get exons and introns

The splicing rule:
    5′ splice site 5′-AG↓GTAAGT-3′
    3′ splice site 5′-PyPyPyPyPyPyNCAG↓-3′

The result:
126
intron: [(25, 32), (40, 44), (86, 114)]
exon: [(1, 24), (33, 39), (45, 85), (115, 126)]
cDNA: ATCGATCGATCGATCGACTGACTTATGCATACTCGATCGATCGATCGATCGATCGATCGATCGATCGATATCGACTACTA
exon seq: ['ATCGATCGATCGATCGACTGACT', 'TATGCA', 'TACTCGATCGATCGATCGATCGATCGATCGATCGATCGAT', 'ATCGACTACTA']
intron seq: ['GTCATAG', 'GTAG', 'GTATGCTATCATCGATCGATATCGATAG']
ok

The script:
# -*- coding: utf-8 -*-
"""
Created on Tue Aug  4 18:13:45 2015

@author: yuan
"""
#extract exon and intron

import re

#
class splicing:
    def __init__(self, seq):
        self.seq=seq
        self.seq_len=len(seq)
        print self.seq_len
        self.exon=[]
        self.intron=[]
       
    def hunt_intron(self):
        start=-1
        end=-1
        for i in range(self.seq_len-1):
            bibase=self.seq[i]+self.seq[i+1]
            if bibase=='GT':
                start=i
                #print 'start',start
            elif bibase=='AG' and start>0:
                end=i
                #print 'end', end
            if start>0 and end>0:
                intron=(start+1,end+2+1)
                self.intron.append(intron)
                #print start,end
                start=-1
                end=-1
        if self.intron==[]:
            self.intron=[(0,0)]
        return self.intron
       
    def hunt_exon(self):
        if self.intron==[]: # run hunt_intron() in advance
            self.hunt_intron
        elif self.intron==[(0,0)]: #no intron
            self.exon=[(1, self.seq_len)]
        #get exon
        if self.intron[0][0]>1:
            exon=(1, self.intron[0][0]-1)
            self.exon.append(exon)
        for i in range(len(self.intron)-1):
            exon_start=self.intron[i][1]+1
            exon_end=self.intron[i+1][0]-1
            exon=(exon_start,exon_end)
            self.exon.append(exon)
        if self.intron[-1][1]<self.seq_len:
            exon=(self.intron[-1][1]+1, self.seq_len)
            self.exon.append(exon)
        #
        return self.exon

    def cDNA_seq(self):
        if self.exon==[]:
            self.hunt_exon()
        #
        cDNA=''
        exons=[]
        for exon in self.exon:
            start_index=exon[0]-1
            end_index=exon[1]-1
            exon_seq=self.seq[start_index:end_index]
            cDNA +=exon_seq                       
            exons.append(exon_seq)
            #print exon_seq
        return cDNA, exons

    def intron_seq(self):
        introns=[]
        if self.intron==[] or self.intron==[(0,0)]:
            pass
        else:
            for intron in self.intron:
                start_index=intron[0]-1
                end_index=intron[1]-1
                introns.append(self.seq[start_index:end_index])
        return introns

if __name__=="__main__":
    seq='ATCGATCGATCGATCGACTGACTAGTCATAGCTATGCATGTAGCTACTCGATCGATCGATCGATCGATCGATCGATCGATCGATCGTATGCTATCATCGATCGATATCGATAGCATCGACTACTAT'
    s=splicing(seq)
    print 'intron:', s.hunt_intron()
    print 'exon:', s.hunt_exon()
    print 'cDNA:', s.cDNA_seq()[0]
    print 'exon seq:', s.cDNA_seq()[1]
    print 'intron seq:', s.intron_seq()
    print 'ok'

No comments:

Post a Comment