Abstract: implement regular expression to hunt motif sequences
The script:
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 7 18:52:34 2015
@author: yuan
"""
import itertools
import re
class hunt_motif:
def __init__(self, fasta_file):
self.fa_file=fasta_file
def read_fasta(self,FUN=None, *args):
#store accession: sequences
ref={}
with open(self.fa_file, 'r') as in_obj:
for line1,line2 in itertools.izip_longest(*[in_obj]*2):
#first line
line1=line1.rstrip('\n')
acc=0
if line1.find('|')>0:
items=line1.split('|')
acc=items[1]
else:
acc=re.sub("^>", "", line1)
#
line2=line2.rstrip('\n')
#ref information
#print acc
if FUN is None:
ref[acc]=line2
else:
ref[acc]=FUN(line2,args[0])
in_obj.close()
return ref
def search_motif(self, seq, motif):
#print motif
#print seq
#runs = re.finditer(r"A([ATCG]{1})TC", seq)
runs = re.finditer(r""+motif, seq)
matches=[]
for match in runs:
start=match.start()
end=match.end()
#print start,end
group=(start, end, seq[start:end])
matches.append(group)
print matches
return matches
if __name__=="__main__":
fa_file='/home/yuan/eRNA/ref_seq/human_Ensembl_GRCh38R80.fa'
#fa_file='/home/yuan/mysql_pre/cookbooks/test.fa'
h=hunt_motif(fa_file)
#GA...ATG....AC
motif="GA([ATGC]{3})ATG([ATGC]{4})AC"
h.read_fasta(h.search_motif, motif)
#AvaII
h.read_fasta(h.search_motif, "GG(A|T)CC")
#hunt AT-rich region
motif="[AT]{20,200}"
h.read_fasta(h.search_motif, motif)
print 'ok'
No comments:
Post a Comment