Wednesday, July 29, 2015

python-bioinformatics: read fasta file

Abstract: read fasta file using mycode and biopython

The script:
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 28 11:42:06 2015

@author: yuan
"""
# GenBank                           gb|accession|locus
# EMBL Data Library                 emb|accession|locus
# DDBJ, DNA Database of Japan       dbj|accession|locus
# NBRF PIR                          pir||entry
# Protein Research Foundation       prf||name
# SWISS-PROT                        sp|accession|entry name
# Brookhaven Protein Data Bank      pdb|entry|chain
# Patents                           pat|country|number
# GenInfo Backbone Id               bbs|number
# General database identifier       gnl|database|identifier
# NCBI Reference Sequence           ref|accession|locus
# Local Sequence identifier         lcl|identifier


import re
import itertools

   
#mycode
def read_fasta(file):
    #store accession: sequences
    ref={}
    with open(file, 'r') as in_obj:
        for line1,line2 in itertools.izip_longest(*[in_obj]*2):
            #first line
            line1=line1.rstrip('\n')

            acc=0
            if line1.find('|')>0:
                items=line1.split('|')
                acc=items[1]
            else:
                acc=re.sub("^>", "", line1)
            #
            line2=line2.rstrip('\n')
            #ref information
            ref[acc]=line2
            print acc
    in_obj.close()
    return ref
#
def biopython_fasta(file):
    from Bio import SeqIO
    ref={}
    in_obj = open(file, "r")
    for record in SeqIO.parse(in_obj, "fasta") :
        acc, seq=record.id, record.seq
        ref[acc]=seq
        print acc
    in_obj.close()
    return ref

if __name__=="__main__":
    fa_file='/home/yuan/eRNA/ref_seq/human_Ensembl_GRCh38R80.fa'
    #fa_file='/home/yuan/mysql_pre/cookbooks/test.fa'
    #my code   
    ref=read_fasta(fa_file)
   
    #biopython
    ref=biopython_fasta(fa_file)
    print 'ok'

No comments:

Post a Comment