Abstract: get exons and introns
The splicing rule:
5′ splice site 5′-AG↓GTAAGT-3′
3′ splice site 5′-PyPyPyPyPyPyNCAG↓-3′
The result:
126
intron: [(25, 32), (40, 44), (86, 114)]
exon: [(1, 24), (33, 39), (45, 85), (115, 126)]
cDNA: ATCGATCGATCGATCGACTGACTTATGCATACTCGATCGATCGATCGATCGATCGATCGATCGATCGATATCGACTACTA
exon seq: ['ATCGATCGATCGATCGACTGACT', 'TATGCA', 'TACTCGATCGATCGATCGATCGATCGATCGATCGATCGAT', 'ATCGACTACTA']
intron seq: ['GTCATAG', 'GTAG', 'GTATGCTATCATCGATCGATATCGATAG']
ok
The script:
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 4 18:13:45 2015
@author: yuan
"""
#extract exon and intron
import re
#
class splicing:
def __init__(self, seq):
self.seq=seq
self.seq_len=len(seq)
print self.seq_len
self.exon=[]
self.intron=[]
def hunt_intron(self):
start=-1
end=-1
for i in range(self.seq_len-1):
bibase=self.seq[i]+self.seq[i+1]
if bibase=='GT':
start=i
#print 'start',start
elif bibase=='AG' and start>0:
end=i
#print 'end', end
if start>0 and end>0:
intron=(start+1,end+2+1)
self.intron.append(intron)
#print start,end
start=-1
end=-1
if self.intron==[]:
self.intron=[(0,0)]
return self.intron
def hunt_exon(self):
if self.intron==[]: # run hunt_intron() in advance
self.hunt_intron
elif self.intron==[(0,0)]: #no intron
self.exon=[(1, self.seq_len)]
#get exon
if self.intron[0][0]>1:
exon=(1, self.intron[0][0]-1)
self.exon.append(exon)
for i in range(len(self.intron)-1):
exon_start=self.intron[i][1]+1
exon_end=self.intron[i+1][0]-1
exon=(exon_start,exon_end)
self.exon.append(exon)
if self.intron[-1][1]<self.seq_len:
exon=(self.intron[-1][1]+1, self.seq_len)
self.exon.append(exon)
#
return self.exon
def cDNA_seq(self):
if self.exon==[]:
self.hunt_exon()
#
cDNA=''
exons=[]
for exon in self.exon:
start_index=exon[0]-1
end_index=exon[1]-1
exon_seq=self.seq[start_index:end_index]
cDNA +=exon_seq
exons.append(exon_seq)
#print exon_seq
return cDNA, exons
def intron_seq(self):
introns=[]
if self.intron==[] or self.intron==[(0,0)]:
pass
else:
for intron in self.intron:
start_index=intron[0]-1
end_index=intron[1]-1
introns.append(self.seq[start_index:end_index])
return introns
if __name__=="__main__":
seq='ATCGATCGATCGATCGACTGACTAGTCATAGCTATGCATGTAGCTACTCGATCGATCGATCGATCGATCGATCGATCGATCGATCGTATGCTATCATCGATCGATATCGATAGCATCGACTACTAT'
s=splicing(seq)
print 'intron:', s.hunt_intron()
print 'exon:', s.hunt_exon()
print 'cDNA:', s.cDNA_seq()[0]
print 'exon seq:', s.cDNA_seq()[1]
print 'intron seq:', s.intron_seq()
print 'ok'
No comments:
Post a Comment