Thursday, August 6, 2015

Bioinformatics in Python: Trim adapters

Abstract:Trim adapter’s sequence at the 3‘-end of reads

The points:
1.pass functions as a parameter into another function for reuse codes.
2. implement regular expression
3.consider more possible patterns when trim adapters.

The script:
# -*- coding: utf-8 -*-
"""
Created on Wed Aug  5 19:27:31 2015

@author: yuan
"""
import itertools
import re

class trim_adapter:
    def __init__(self,in_file, out_file):
        self.in_file=in_file
        self.out_file=out_file
        self.adapter3='TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCCGTCCATCTCGTATGCCGTCTTCT'
        self.len_adapter3=len(self.adapter3)
       
    def read_fastq(self, FUN=None, *args):
        #store accession: sequences
        reads={}
        out_obj=open(self.out_file,'w')
        with open(self.in_file, 'r') as in_obj:
            for line1, line2, line3, line4 in itertools.izip_longest(*[in_obj]*4):
                #first line
                line1=line1.rstrip('\n')
                items=line1.split()[0]
                items=items.split(':')
                coordinate=items[-2]+'-'+items[-1]
                #print items, coordinate
                #second line: sequence
                seq=line2.rstrip('\n')
                #print seq
                #ref information
                if FUN is None:
                    reads[coordinate]=seq
                else:
                    reads[coordinate]=FUN(seq)
                    if len(reads[coordinate])>6:
                        print reads[coordinate]
                        out_obj.write(coordinate+'\t'+reads[coordinate]+'\n')
        in_obj.close()
        out_obj.close()
        return reads
   
    def trim_3end(self, seq):
        #print seq
        flag=0
        #all adapter sequence
        index=re.search(self.adapter3, seq)
        if index is not None:
            trimmed_seq=seq[0:index.start()]
            flag=1
        else:
            #partial adapter at the 3 end of seq
            for i in range(6,self.len_adapter3+1)[::-1]:
                part_adapter_seq=self.adapter3[0:i]
                #print part_adapter_seq
                index=re.search(r''+self.adapter3+r'\b', seq)
                if index is not None:
                    trimmed_seq=seq[0:index.start()]
                    #print '=', trimmed_seq
                    flag=1
                    break
        #
        if flag==1:
            seq=trimmed_seq
        #print seq
        return seq

if __name__=="__main__":
    fq_file='/home/yuan/mysql_pre/cookbooks/test.fq'
    out_file='/home/yuan/mysql_pre/cookbooks/test_trim.txt'
    #my code   
    t=trim_adapter(fq_file,out_file)
    t.read_fastq( t.trim_3end ) #Note should not be t.trim_3end()
    print 'ok'

No comments:

Post a Comment