Friday, September 4, 2015

Bioinformatics in Python: analyze SAM file

Abstract: SAM stands for sequence alignment determined by the aligner namely Bowtie.

The script:
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 26 17:41:09 2015

@author: yuan
"""

#methods related to sequence alignment
import os
import myIO
import subprocess
import re
#class: alignment

#
class alignment:

    def __init__(self, parameters):
        #parameters is a dictionary
        self.par=parameters
        #print self.par['tool_alinger'] #namely bowtie1
        #print self.par['dir_aligner'] #namely /home/yuan/eRNA/bowtie1/
        #print self.par['fa_genome'] #namely /home/yuan/eRNA/ref_seq/human.fa
        self.init_aligner_par()
       
    def init_aligner_par(self):
        if self.par['tool_aligner']=='bowtie1':
            self.par['bowtie_aligner']=self.par['dir_aligner']+'bowtie'
            self.par['bowtie_builder']=self.par['dir_aligner']+'bowtie-build'
        elif self.par['tool_aligner']=='bowtie2':
            self.par['bowtie_aligner']=self.par['dir_aligner']+'bowtie2'
            self.par['bowtie_builder']=self.par['dir_aligner']+'bowtie2-build'
        #print self.par['bowtie_aligner'], self.par['bowtie_builder']

        #bowtie index
        file_head=myIO.file_os(self.par['fa_genome']).file_name()[2]
        self.par['bowtie_index_name']=file_head
        self.par['bowtie_index']=self.par['dir_aligner']+file_head
        #print self.par['bowtie_index'], self.par['bowtie_index_name']
       
    def check_bowtie1(self, tail='.ebwt'):
        #file tail of the bowtie index files
        index_tails=['.1', '.2','.3','.4','.rev.1','.rev.2']
        index_tails=[ o+tail for o in index_tails]
        #   
        flag=1
        for tail in index_tails:
            index_file=''.join([self.par['bowtie_index'],tail])
            if not os.path.isfile(index_file):
                flag=0
                break
        return flag

    def check_bowtie2(self):
        tail='.bt2'
        flag=self.check_bowtie1(tail)
        return flag

    def build_bowtie_index(self):
        #check bowtie index
        flag=1
        if self.par['tool_aligner']=='bowtie1':
            flag=self.check_bowtie1()
        elif self.par['tool_aligner']=='bowtie2':
            flag=self.check_bowtie2()
        #build bowtie index   
        output='Bowtie index of %s exists' % self.par['fa_genome']
        if flag==0:
            print 'build bowtie index: %s from %s' % (self.par['bowtie_index'], self.par['fa_genome'])
            shell_command=' '.join([self.par['bowtie_builder'], self.par['fa_genome'], self.par['bowtie_index']])
            #print shell_command
            output=subprocess.Popen(shell_command, stdout=subprocess.PIPE, shell=True).stdout.read()
            #output=output.split("\n")
        print output
        return output
       
    def analyze_FLAG(self, FLAG):
        binary_str=format(int(FLAG), 'b')
        #fixed length 12 characters
        binary_str=''.join(['0']*(12-len(binary_str)))+binary_str
        #print sam_info['FLAG'],binary_str
        #
        flags={}
        #paired-end, single-end
        flags['paired']=binary_str[-1]
        #aligned, unaligned, multiple-aligned
        if binary_str[-3]==1:
            flags['aligned']='0' #unaligned
        elif binary_str[-9]==1:
            flags['aligned']='3' #multiple-alinged
        else:
            flags['aligned']='1'
        #reversed complementary
        flags['revcom']=binary_str[-5]
        #R1 or R2
        if binary_str[-8]==1:
            flags['end']='R2' #unaligned
        else:
            flags['end']='R1'
        return flags
    def analyze_CIGAR(self, CIGAR, offset):
        cigar_info={}
        exons=[int(offset)]
        insertions=[]
        deletions=[]
        #analyze CIGAR
        cigar_num=re.split(r'[A-Z]',CIGAR)[:-1]
        cigar_chr=re.findall(r'[A-Z]', CIGAR)
        for a, b in zip(cigar_num,cigar_chr):
            a=int(a)
            if b=='M':
                if len(exons)%2==1: #matches
                    exons.append(exons[-1]+a-1)
                else:
                    exons[-1] +=a
            elif b=='N': # intron
                exons.append(exons[-1]+a+1)
            elif b=='D': #deletion
                exons[-1] +=a
                deletions.append(exons[-1]+1)
                deletions.append(exons[-1]+a)
            elif b=='I': #insertion
                insertions.append(exons[-1])
        #print offset, exons
        #
        #for start, end in zip(exons[::2],exons[1::2]):
        #    print start, end
        cigar_info['exons']=exons
        cigar_info['insertions']='NA' if insertions==[] else insertions
        cigar_info['deletions']='NA' if deletions==[] else deletions
        print cigar_info
        return cigar_info
           
    def analyze_SAM(self, sam_line):
        #loop of the sam line
        items=sam_line.split("\t")
        sam_info={'qname':items[0],'FLAG':items[1],'ref':items[2],
                  'offset':items[3],'mapQ':items[4],'CIGAR':items[5],
                  'mate_ref':items[6],'mate_offset':items[7],
                  'fragment_size':items[8],'seq':items[9], 'Qual':items[10] }
        #optional fields
        for item in items[11:]:
            (tag,mid,value)=item.split(':')
            sam_info[tag]=value
            #print key,value
       
        #analyze sam info
        #coordinate from fastq file
        qname_list=sam_info['qname'].split(':')  
        if len(qname_list)>3 and qname_list[-2].isdigit() and qname_list[-1].isdigit():
            sam_info['coordinate']= '-'.join(qname_list[-2:])
        else:
            sam_info['coordinate']='NA'
        #print qname_list, sam_info['coordinate']
       
        #analyze FLAG
        flag_info=self.analyze_FLAG(sam_info['FLAG'])
        sam_info.update(flag_info)
       
        #analyze CIGAR
        cigar_info=self.analyze_CIGAR(sam_info['CIGAR'], sam_info['offset'])
        sam_info.update(cigar_info)
       
        return sam_info
#end





No comments:

Post a Comment