Friday, August 7, 2015
Bioinformatics in Python: read the file in gtf/gff3 format
Abstract: read gtf file and extract genome annotations
GTF format:
Fields must be tab-separated into 9 columns. "empty" columns are denoted with a '.'
seqname - name of the chromosome or scaffold; chromosome names can be given with or without the 'chr' prefix.
source - name of the program that generated this feature, or the data source (database or project name)
feature - feature type name, e.g. Gene, Variation, Similarity
start - Start position of the feature, with sequence numbering starting at 1.
end - End position of the feature, with sequence numbering starting at 1.
score - A floating point value.
strand - defined as + (forward) or - (reverse).
frame - One of '0', '1' or '2'. '0' indicates that the first base of the feature is the first base of a codon, '1' that the second base is the first base of a codon, and so on..
attribute - A semicolon-separated list of tag-value pairs, providing additional information about each feature.
The script below can extract annotations of exons, genes, or transcripts.
The script:
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 6 08:25:58 2015
@author: yuan
"""
import re
class read_file:
def __init__(self, file):
self.file=file
self.gene_annot={}
#Implementation of perl's autovivification feature
def initiate_dict(self):
try:
return dict.__getitem__(self, item)
except KeyError:
value = self[item] = type(self)()
return value
def read_gtf(self, feature_name='gene'):
feature_id=feature_name+'_id'
feature_annot={}
#n=0
#store accession: sequences
in_obj=open(self.file, 'r')
for line in in_obj:
annot={}
line=line.rstrip(';\n')
items=line.split("\t")
annot['chr']=items[0]
annot['source']=items[1]
annot['feature']=items[2]
annot['start']=items[3]
annot['end']=items[4]
annot['strand']=items[6]
#print annot['source']
attr=items[8].split("; ")
#print attr
for a in attr:
#print a
name,w, value=a.partition(" ")
annot[name]=re.sub('\"', '', value)
#print name, annot[name], "\n"
#n+=1
#if n==100:
# break
#
if annot['feature']==feature_name:
feature_annot[annot[feature_id]]=annot
in_obj.close()
return feature_annot
def get_annot(self, column, our_dir, feature='gene'):
print "get annotation of", feature
annot=self.read_gtf(feature)
out_file=out_dir+'exon_annotations.txt'
print "export to", out_file
out_obj=open(out_file, 'w')
#header line
header=list(columns)
header.insert(0,'id')
header='\t'.join(header)
out_obj.write(header+'\n')
#
for annot_dict in annot.keys():
out_obj.write(annot_dict+'\t')
for col in columns:
out_obj.write(annot[annot_dict][col]+'\t')
out_obj.write('\n')
out_obj.close()
#
return annot
#
if __name__=="__main__":
gtf_file='/home/yuan/eRNA/ref_seq/human_Ensembl_GRCh38R80.gtf'
r=read_file(gtf_file)
#gene=r.get_annot('gene')
#transcript=r.get_annot('transcript')
out_dir='/home/yuan/mysql_pre/cookbooks/'
out_columns=['chr','start','end','transcript_id','gene_id', 'gene_name', 'gene_biotype']
exon=r.get_annot('exon', out_columns, out_dir)
print 'ok'
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment