Friday, August 7, 2015

Bioinformatics in Python: read the file in gtf/gff3 format


Abstract: read gtf file and extract genome annotations


GTF format:
Fields must be tab-separated into 9 columns. "empty" columns are denoted with a '.'
    seqname - name of the chromosome or scaffold; chromosome names can be given with or without the 'chr' prefix.
    source - name of the program that generated this feature, or the data source (database or project name)
    feature - feature type name, e.g. Gene, Variation, Similarity
    start - Start position of the feature, with sequence numbering starting at 1.
    end - End position of the feature, with sequence numbering starting at 1.
    score - A floating point value.
    strand - defined as + (forward) or - (reverse).
    frame - One of '0', '1' or '2'. '0' indicates that the first base of the feature is the first base of a codon, '1' that the second base is the first base of a codon, and so on..
    attribute - A semicolon-separated list of tag-value pairs, providing additional information about each feature.

The script below can extract annotations of exons, genes, or transcripts.

The script:
# -*- coding: utf-8 -*-
"""
Created on Thu Aug  6 08:25:58 2015

@author: yuan
"""
import re

class read_file:

    def __init__(self, file):
        self.file=file
        self.gene_annot={}

    #Implementation of perl's autovivification feature
    def initiate_dict(self):
        try:
            return dict.__getitem__(self, item)
        except KeyError:
            value = self[item] = type(self)()
            return value
           
    def read_gtf(self, feature_name='gene'):
        feature_id=feature_name+'_id'
        feature_annot={}
        #n=0
        #store accession: sequences
        in_obj=open(self.file, 'r')
        for line in in_obj:
            annot={}
            line=line.rstrip(';\n')
            items=line.split("\t")
            annot['chr']=items[0]
            annot['source']=items[1]
            annot['feature']=items[2]
            annot['start']=items[3]
            annot['end']=items[4]
            annot['strand']=items[6]
            #print annot['source']
            attr=items[8].split("; ")
            #print attr
            for a in attr:
                #print a
                name,w, value=a.partition(" ")
                annot[name]=re.sub('\"', '', value)
                #print name, annot[name], "\n"
            #n+=1
            #if n==100:
             #   break
            #
            if annot['feature']==feature_name:
                feature_annot[annot[feature_id]]=annot
        in_obj.close()
        return feature_annot
   
    def get_annot(self, column, our_dir, feature='gene'):
        print "get annotation of", feature
        annot=self.read_gtf(feature)
       
        out_file=out_dir+'exon_annotations.txt'
        print "export to", out_file
        out_obj=open(out_file, 'w')       
        #header line
        header=list(columns)
        header.insert(0,'id')
        header='\t'.join(header)
        out_obj.write(header+'\n')
        #
        for annot_dict in annot.keys():
            out_obj.write(annot_dict+'\t')
            for col in columns:
                out_obj.write(annot[annot_dict][col]+'\t')
            out_obj.write('\n')
        out_obj.close()
        #
        return annot
       
#
if __name__=="__main__":
    gtf_file='/home/yuan/eRNA/ref_seq/human_Ensembl_GRCh38R80.gtf'
    r=read_file(gtf_file)
    #gene=r.get_annot('gene')
    #transcript=r.get_annot('transcript')
    out_dir='/home/yuan/mysql_pre/cookbooks/'
    out_columns=['chr','start','end','transcript_id','gene_id', 'gene_name', 'gene_biotype']
    exon=r.get_annot('exon', out_columns, out_dir)
    print 'ok'

No comments:

Post a Comment