Wednesday, July 29, 2015

python-bioinformatics: read fastq file

Abstract : get coordinate, read sequences, and Q values from a fastq file using mycode  and biopython

The script:
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 28 11:42:06 2015

@author: yuan
"""
# GenBank                           gb|accession|locus
# EMBL Data Library                 emb|accession|locus
# DDBJ, DNA Database of Japan       dbj|accession|locus
# NBRF PIR                          pir||entry
# Protein Research Foundation       prf||name
# SWISS-PROT                        sp|accession|entry name
# Brookhaven Protein Data Bank      pdb|entry|chain
# Patents                           pat|country|number
# GenInfo Backbone Id               bbs|number
# General database identifier       gnl|database|identifier
# NCBI Reference Sequence           ref|accession|locus
# Local Sequence identifier         lcl|identifier


import re
import itertools
from Bio import SeqIO
   
#mycode
def read_fastq(file):
    #store accession: sequences
    reads={}
    readsQ={}
    with open(file, 'r') as in_obj:
        for line1, line2, line3, line4 in itertools.izip_longest(*[in_obj]*4):
            #first line
            line1=line1.rstrip('\n')
            items=line1.split()[0]
            items=items.split(':')
            coordinate=items[-2]+'-'+items[-1]
            print items, coordinate
            #second line: sequence
            seq=line2.rstrip('\n')
            print seq
            #fourth line:  Q value
            line4=line4.rstrip('\n')
            Q=map(ord, line4) #ASCII to decimal
            print Q
            #ref information
            reads[coordinate]=seq
            readsQ[coordinate]=Q
    in_obj.close()
    return reads, readsQ
#
def biopython_fastq(file):
    reads={}
    readsQ={}
    in_obj = open(file, "r")
    for record in SeqIO.parse(in_obj, "fastq") :
        id, seq, Q=record.id, record.seq, record.letter_annotations["phred_quality"]
        items=id.split(':')
        coordinate=items[-2]+'-'+items[-1]
        #
        reads[coordinate]=seq
        readsQ[coordinate]=Q
        print coordinate, seq, Q
    in_obj.close()
    return reads, readsQ

if __name__=="__main__":
    fq_file='/home/yuan/mysql_pre/cookbooks/test.fq'
    #my code   
    ref=read_fastq(fq_file)
   
    #biopython
    ref=biopython_fastq(fq_file)
    print 'ok'

No comments:

Post a Comment