Friday, August 7, 2015

Bioinformatics in Python: hunt motif

Abstract: implement regular expression to hunt motif sequences

The script:
# -*- coding: utf-8 -*-
"""
Created on Fri Aug  7 18:52:34 2015

@author: yuan
"""
import itertools
import re

class hunt_motif:
    def __init__(self, fasta_file):
        self.fa_file=fasta_file

    def read_fasta(self,FUN=None, *args):
        #store accession: sequences
        ref={}
        with open(self.fa_file, 'r') as in_obj:
            for line1,line2 in itertools.izip_longest(*[in_obj]*2):
                #first line
                line1=line1.rstrip('\n')
               
                acc=0
                if line1.find('|')>0:
                    items=line1.split('|')
                    acc=items[1]
                else:
                    acc=re.sub("^>", "", line1)
                #
                line2=line2.rstrip('\n')
                #ref information
                #print acc
                if FUN is None:
                    ref[acc]=line2
                else:
                    ref[acc]=FUN(line2,args[0])
        in_obj.close()
        return ref
       
    def search_motif(self, seq, motif):
        #print motif
        #print seq
        #runs = re.finditer(r"A([ATCG]{1})TC", seq)
        runs = re.finditer(r""+motif, seq)
        matches=[]
        for match in runs:
            start=match.start()
            end=match.end()
            #print start,end
            group=(start, end, seq[start:end])
            matches.append(group)
        print matches
        return matches

if __name__=="__main__":
    fa_file='/home/yuan/eRNA/ref_seq/human_Ensembl_GRCh38R80.fa'
    #fa_file='/home/yuan/mysql_pre/cookbooks/test.fa'
    h=hunt_motif(fa_file)

    #GA...ATG....AC
    motif="GA([ATGC]{3})ATG([ATGC]{4})AC"
    h.read_fasta(h.search_motif, motif)
   
    #AvaII
    h.read_fasta(h.search_motif, "GG(A|T)CC")

    #hunt AT-rich region
    motif="[AT]{20,200}"
    h.read_fasta(h.search_motif, motif)

    print 'ok'

No comments:

Post a Comment