Abstract: SAM stands for sequence alignment determined by the aligner namely Bowtie.
The script:
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 26 17:41:09 2015
@author: yuan
"""
#methods related to sequence alignment
import os
import myIO
import subprocess
import re
#class: alignment
#
class alignment:
def __init__(self, parameters):
#parameters is a dictionary
self.par=parameters
#print self.par['tool_alinger'] #namely bowtie1
#print self.par['dir_aligner'] #namely /home/yuan/eRNA/bowtie1/
#print self.par['fa_genome'] #namely /home/yuan/eRNA/ref_seq/human.fa
self.init_aligner_par()
def init_aligner_par(self):
if self.par['tool_aligner']=='bowtie1':
self.par['bowtie_aligner']=self.par['dir_aligner']+'bowtie'
self.par['bowtie_builder']=self.par['dir_aligner']+'bowtie-build'
elif self.par['tool_aligner']=='bowtie2':
self.par['bowtie_aligner']=self.par['dir_aligner']+'bowtie2'
self.par['bowtie_builder']=self.par['dir_aligner']+'bowtie2-build'
#print self.par['bowtie_aligner'], self.par['bowtie_builder']
#bowtie index
file_head=myIO.file_os(self.par['fa_genome']).file_name()[2]
self.par['bowtie_index_name']=file_head
self.par['bowtie_index']=self.par['dir_aligner']+file_head
#print self.par['bowtie_index'], self.par['bowtie_index_name']
def check_bowtie1(self, tail='.ebwt'):
#file tail of the bowtie index files
index_tails=['.1', '.2','.3','.4','.rev.1','.rev.2']
index_tails=[ o+tail for o in index_tails]
#
flag=1
for tail in index_tails:
index_file=''.join([self.par['bowtie_index'],tail])
if not os.path.isfile(index_file):
flag=0
break
return flag
def check_bowtie2(self):
tail='.bt2'
flag=self.check_bowtie1(tail)
return flag
def build_bowtie_index(self):
#check bowtie index
flag=1
if self.par['tool_aligner']=='bowtie1':
flag=self.check_bowtie1()
elif self.par['tool_aligner']=='bowtie2':
flag=self.check_bowtie2()
#build bowtie index
output='Bowtie index of %s exists' % self.par['fa_genome']
if flag==0:
print 'build bowtie index: %s from %s' % (self.par['bowtie_index'], self.par['fa_genome'])
shell_command=' '.join([self.par['bowtie_builder'], self.par['fa_genome'], self.par['bowtie_index']])
#print shell_command
output=subprocess.Popen(shell_command, stdout=subprocess.PIPE, shell=True).stdout.read()
#output=output.split("\n")
print output
return output
def analyze_FLAG(self, FLAG):
binary_str=format(int(FLAG), 'b')
#fixed length 12 characters
binary_str=''.join(['0']*(12-len(binary_str)))+binary_str
#print sam_info['FLAG'],binary_str
#
flags={}
#paired-end, single-end
flags['paired']=binary_str[-1]
#aligned, unaligned, multiple-aligned
if binary_str[-3]==1:
flags['aligned']='0' #unaligned
elif binary_str[-9]==1:
flags['aligned']='3' #multiple-alinged
else:
flags['aligned']='1'
#reversed complementary
flags['revcom']=binary_str[-5]
#R1 or R2
if binary_str[-8]==1:
flags['end']='R2' #unaligned
else:
flags['end']='R1'
return flags
def analyze_CIGAR(self, CIGAR, offset):
cigar_info={}
exons=[int(offset)]
insertions=[]
deletions=[]
#analyze CIGAR
cigar_num=re.split(r'[A-Z]',CIGAR)[:-1]
cigar_chr=re.findall(r'[A-Z]', CIGAR)
for a, b in zip(cigar_num,cigar_chr):
a=int(a)
if b=='M':
if len(exons)%2==1: #matches
exons.append(exons[-1]+a-1)
else:
exons[-1] +=a
elif b=='N': # intron
exons.append(exons[-1]+a+1)
elif b=='D': #deletion
exons[-1] +=a
deletions.append(exons[-1]+1)
deletions.append(exons[-1]+a)
elif b=='I': #insertion
insertions.append(exons[-1])
#print offset, exons
#
#for start, end in zip(exons[::2],exons[1::2]):
# print start, end
cigar_info['exons']=exons
cigar_info['insertions']='NA' if insertions==[] else insertions
cigar_info['deletions']='NA' if deletions==[] else deletions
print cigar_info
return cigar_info
def analyze_SAM(self, sam_line):
#loop of the sam line
items=sam_line.split("\t")
sam_info={'qname':items[0],'FLAG':items[1],'ref':items[2],
'offset':items[3],'mapQ':items[4],'CIGAR':items[5],
'mate_ref':items[6],'mate_offset':items[7],
'fragment_size':items[8],'seq':items[9], 'Qual':items[10] }
#optional fields
for item in items[11:]:
(tag,mid,value)=item.split(':')
sam_info[tag]=value
#print key,value
#analyze sam info
#coordinate from fastq file
qname_list=sam_info['qname'].split(':')
if len(qname_list)>3 and qname_list[-2].isdigit() and qname_list[-1].isdigit():
sam_info['coordinate']= '-'.join(qname_list[-2:])
else:
sam_info['coordinate']='NA'
#print qname_list, sam_info['coordinate']
#analyze FLAG
flag_info=self.analyze_FLAG(sam_info['FLAG'])
sam_info.update(flag_info)
#analyze CIGAR
cigar_info=self.analyze_CIGAR(sam_info['CIGAR'], sam_info['offset'])
sam_info.update(cigar_info)
return sam_info
#end
No comments:
Post a Comment