Abstract : get coordinate, read sequences, and Q values from a fastq file using mycode and biopython
The script:
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 28 11:42:06 2015
@author: yuan
"""
# GenBank gb|accession|locus
# EMBL Data Library emb|accession|locus
# DDBJ, DNA Database of Japan dbj|accession|locus
# NBRF PIR pir||entry
# Protein Research Foundation prf||name
# SWISS-PROT sp|accession|entry name
# Brookhaven Protein Data Bank pdb|entry|chain
# Patents pat|country|number
# GenInfo Backbone Id bbs|number
# General database identifier gnl|database|identifier
# NCBI Reference Sequence ref|accession|locus
# Local Sequence identifier lcl|identifier
import re
import itertools
from Bio import SeqIO
#mycode
def read_fastq(file):
#store accession: sequences
reads={}
readsQ={}
with open(file, 'r') as in_obj:
for line1, line2, line3, line4 in itertools.izip_longest(*[in_obj]*4):
#first line
line1=line1.rstrip('\n')
items=line1.split()[0]
items=items.split(':')
coordinate=items[-2]+'-'+items[-1]
print items, coordinate
#second line: sequence
seq=line2.rstrip('\n')
print seq
#fourth line: Q value
line4=line4.rstrip('\n')
Q=map(ord, line4) #ASCII to decimal
print Q
#ref information
reads[coordinate]=seq
readsQ[coordinate]=Q
in_obj.close()
return reads, readsQ
#
def biopython_fastq(file):
reads={}
readsQ={}
in_obj = open(file, "r")
for record in SeqIO.parse(in_obj, "fastq") :
id, seq, Q=record.id, record.seq, record.letter_annotations["phred_quality"]
items=id.split(':')
coordinate=items[-2]+'-'+items[-1]
#
reads[coordinate]=seq
readsQ[coordinate]=Q
print coordinate, seq, Q
in_obj.close()
return reads, readsQ
if __name__=="__main__":
fq_file='/home/yuan/mysql_pre/cookbooks/test.fq'
#my code
ref=read_fastq(fq_file)
#biopython
ref=biopython_fastq(fq_file)
print 'ok'
No comments:
Post a Comment